From 51fac37bb20c9440a9a4e0a20846c139364d6d13 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 25 Apr 2024 04:54:52 +0200 Subject: Adding upstream version 255.5. Signed-off-by: Daniel Baumann --- docs/API_FILE_SYSTEMS.md | 38 ++- docs/ARCHITECTURE.md | 214 ++++++--------- docs/BACKPORTS.md | 16 +- docs/BOOT.md | 10 +- docs/BOOT_LOADER_INTERFACE.md | 2 +- docs/BUILDING_IMAGES.md | 139 +++++----- docs/CATALOG.md | 63 +++-- docs/CGROUP_DELEGATION.md | 4 +- docs/CODE_OF_CONDUCT.md | 10 +- docs/CODE_QUALITY.md | 2 +- docs/CODING_STYLE.md | 2 +- docs/CONTAINER_INTERFACE.md | 7 +- docs/CONTRIBUTING.md | 42 +-- docs/CONTROL_GROUP_INTERFACE.md | 20 +- docs/CONVERTING_TO_HOMED.md | 123 +++++---- docs/COREDUMP.md | 84 +++--- docs/CREDENTIALS.md | 4 +- docs/DAEMON_SOCKET_ACTIVATION.md | 25 +- docs/DEBUGGING.md | 133 +++++++--- docs/DESKTOP_ENVIRONMENTS.md | 28 +- docs/DISTRO_PORTING.md | 34 ++- docs/ELF_PACKAGE_METADATA.md | 4 +- docs/FAQ.md | 11 +- docs/GROUP_RECORD.md | 61 ++--- docs/HACKING.md | 12 +- docs/HOME_DIRECTORY.md | 188 +++++++------ docs/INCOMPATIBILITIES.md | 8 +- docs/INHIBITOR_LOCKS.md | 74 +++++- docs/INITRD_INTERFACE.md | 9 +- docs/JOURNAL_EXPORT_FORMATS.md | 40 ++- docs/JOURNAL_FILE_FORMAT.md | 90 +++---- docs/MINIMAL_BUILDS.md | 19 +- docs/MY_SERVICE_CANT_GET_REATLIME.md | 49 +++- docs/OPTIMIZATIONS.md | 133 ++++++++-- docs/PASSWORD_AGENTS.md | 43 ++- docs/PAX_CONTROL_GROUPS.md | 117 ++++++++ docs/PORTABILITY_AND_STABILITY.md | 108 +++++--- docs/PREDICTABLE_INTERFACE_NAMES.md | 42 ++- docs/PRESET.md | 31 ++- docs/RANDOM_SEEDS.md | 338 +++++++++++------------- docs/ROOT_STORAGE_DAEMONS.md | 4 +- docs/SECURITY.md | 8 +- docs/SEPARATE_USR_IS_BROKEN.md | 75 +++++- docs/SYSLOG.md | 48 +++- docs/SYSTEMD_FILE_HIERARCHY_REQUIREMENTS.md | 30 ++- docs/THE_CASE_FOR_THE_USR_MERGE.md | 2 +- docs/TIPS_AND_TRICKS.md | 16 +- docs/UIDS-GIDS.md | 299 ++++++++++----------- docs/USERDB_AND_DESKTOPS.md | 150 +++++------ docs/USER_GROUP_API.md | 217 +++++++-------- 
docs/USER_NAMES.md | 91 +++---- docs/USER_RECORD.md | 8 +- docs/VIRTUALIZED_TESTING.md | 23 +- docs/WRITING_DESKTOP_ENVIRONMENTS.md | 41 ++- docs/WRITING_DISPLAY_MANAGERS.md | 48 +++- docs/WRITING_NETWORK_CONFIGURATION_MANAGERS.md | 62 +++-- docs/WRITING_RESOLVER_CLIENTS.md | 52 ++-- docs/WRITING_VM_AND_CONTAINER_MANAGERS.md | 53 +++- docs/assets/f17boot.png | Bin 0 -> 5807 bytes man/common-variables.xml | 8 +- man/crypttab.xml | 4 +- man/custom-entities.ent.in | 2 +- man/custom-html.xsl | 6 +- man/daemon.xml | 4 +- man/event-quick-child.c | 2 +- man/hwdb-usb-device.c | 6 +- man/journalctl.xml | 2 +- man/kernel-command-line.xml | 22 ++ man/notify-selfcontained-example.c | 173 ++++++++++++ man/org.freedesktop.resolve1.xml | 39 ++- man/path-documents.c | 2 +- man/portablectl.xml | 6 +- man/repart.d.xml | 4 +- man/sd_bus_error-example.c | 2 +- man/sd_event_add_io.xml | 24 +- man/sd_journal_get_cursor.xml | 15 +- man/sd_journal_seek_head.xml | 10 +- man/sd_notify.xml | 5 + man/systemctl.xml | 68 +++-- man/systemd-bless-boot.service.xml | 2 +- man/systemd-bsod.service.xml | 2 +- man/systemd-cryptsetup.xml | 2 +- man/systemd-hibernate-resume-generator.xml | 11 +- man/systemd-hibernate-resume.service.xml | 2 +- man/systemd-journald.service.xml | 18 ++ man/systemd-nspawn.xml | 2 +- man/systemd-pcrlock.xml | 4 +- man/systemd-poweroff.service.xml | 2 +- man/systemd-repart.xml | 2 +- man/systemd-socket-proxyd.xml | 5 + man/systemd-soft-reboot.service.xml | 2 +- man/systemd-storagetm.service.xml | 2 +- man/systemd-sysext.xml | 2 +- man/systemd-timedated.service.xml | 4 +- man/systemd-tpm2-setup.service.xml | 2 +- man/systemd-vmspawn.xml | 4 +- man/systemd.exec.xml | 29 +- man/systemd.network.xml | 12 +- man/systemd.pcrlock.xml | 10 +- man/systemd.resource-control.xml | 16 +- man/systemd.service.xml | 14 +- man/systemd.socket.xml | 2 +- man/ukify.xml | 10 +- man/varlinkctl.xml | 8 +- meson.build | 6 +- meson_options.txt | 4 +- mkosi.images/base/mkosi.build.chroot | 6 - 
mkosi.images/base/mkosi.conf.d/10-opensuse.conf | 1 - po/POTFILES.skip | 3 + shell-completion/zsh/_journalctl | 6 +- shell-completion/zsh/_udevadm | 2 +- src/analyze/analyze-srk.c | 4 +- src/backlight/backlight.c | 4 + src/basic/chase.c | 30 +-- src/basic/chattr-util.c | 2 +- src/basic/env-util.c | 31 ++- src/basic/env-util.h | 2 + src/basic/filesystems-gperf.gperf | 1 + src/basic/fs-util.c | 8 +- src/basic/fs-util.h | 10 +- src/basic/hashmap.h | 2 +- src/basic/lock-util.c | 14 +- src/basic/log.c | 10 +- src/basic/meson.build | 2 +- src/basic/missing_magic.h | 5 + src/basic/os-util.c | 55 +++- src/basic/os-util.h | 1 + src/basic/stat-util.c | 26 +- src/basic/stat-util.h | 3 + src/basic/terminal-util.c | 2 +- src/basic/terminal-util.h | 3 +- src/basic/virt.c | 1 + src/boot/efi/boot.c | 6 +- src/boot/efi/cpio.c | 2 +- src/boot/efi/meson.build | 5 + src/boot/efi/stub.c | 8 +- src/busctl/busctl.c | 9 + src/core/bpf-socket-bind.c | 9 + src/core/bpf/socket_bind/socket-bind-api.bpf.h | 7 +- src/core/bpf/socket_bind/socket-bind.bpf.c | 3 + src/core/dynamic-user.c | 4 +- src/core/exec-invoke.c | 4 +- src/core/main.c | 2 +- src/core/manager-serialize.c | 3 + src/core/mount.c | 89 ++++--- src/core/scope.c | 2 + src/core/service.c | 18 +- src/core/show-status.c | 15 +- src/core/socket.c | 5 +- src/core/swap.c | 4 +- src/cryptenroll/cryptenroll-tpm2.c | 4 +- src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c | 4 +- src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h | 2 +- src/cryptsetup/cryptsetup-tpm2.c | 4 +- src/cryptsetup/cryptsetup.c | 4 +- src/dissect/dissect.c | 3 +- src/home/homed-manager.c | 2 +- src/home/homework-cifs.c | 62 ++--- src/journal-remote/journal-gatewayd.c | 1 + src/journal-remote/journal-remote-main.c | 22 +- src/journal-remote/journal-remote.c | 6 +- src/journal-remote/journal-remote.h | 2 +- src/journal-remote/journal-upload.c | 40 +-- src/journal-remote/journal-upload.h | 1 - src/journal/cat.c | 10 +- src/journal/journalctl.c | 14 +- 
src/kernel-install/60-ukify.install.in | 6 + src/kernel-install/90-uki-copy.install | 4 +- src/libsystemd-network/dhcp-option.c | 55 +++- src/libsystemd-network/dhcp-option.h | 1 + src/libsystemd-network/sd-dhcp-lease.c | 10 +- src/libsystemd-network/sd-dhcp-server.c | 14 +- src/libsystemd/sd-bus/bus-error.c | 6 +- src/libsystemd/sd-device/device-private.h | 5 +- src/libsystemd/sd-device/sd-device.c | 4 +- src/libsystemd/sd-event/sd-event.c | 19 +- src/libsystemd/sd-event/test-event.c | 18 ++ src/libsystemd/sd-id128/id128-util.c | 4 +- src/libsystemd/sd-journal/journal-file.c | 11 +- src/libsystemd/sd-journal/journal-verify.c | 2 +- src/libsystemd/sd-journal/sd-journal.c | 2 +- src/login/logind-dbus.c | 2 +- src/network/networkd-state-file.c | 4 +- src/network/tc/qdisc.c | 6 + src/network/tc/tclass.c | 6 + src/partition/repart.c | 4 +- src/pcrextend/pcrextend.c | 2 +- src/pcrlock/pcrlock.c | 8 +- src/portable/portable.c | 88 ++---- src/resolve/resolved-bus.c | 10 +- src/resolve/resolved-dns-cache.c | 16 ++ src/resolve/resolved-dns-query.c | 27 +- src/resolve/resolved-dns-rr.c | 17 ++ src/resolve/resolved-dns-rr.h | 1 + src/resolve/resolved-dns-scope.c | 61 +++-- src/resolve/resolved-dns-scope.h | 1 + src/resolve/resolved-dns-stream.c | 41 +++ src/resolve/resolved-dns-stream.h | 1 + src/resolve/resolved-dns-stub.c | 10 +- src/resolve/resolved-dns-synthesize.c | 2 +- src/resolve/resolved-dns-transaction.c | 193 +++++++------- src/resolve/resolved-dns-transaction.h | 5 + src/resolve/resolved-dns-trust-anchor.c | 5 + src/rpm/macros.systemd.in | 1 + src/shared/base-filesystem.c | 4 +- src/shared/blockdev-util.c | 56 ++-- src/shared/bpf-dlopen.c | 11 +- src/shared/btrfs-util.c | 10 +- src/shared/copy.c | 123 ++++++--- src/shared/copy.h | 1 + src/shared/creds-util.c | 4 +- src/shared/data-fd-util.h | 2 +- src/shared/dlfcn-util.c | 2 + src/shared/hibernate-util.c | 8 + src/shared/idn-util.c | 5 +- src/shared/install.c | 7 +- src/shared/journal-file-util.c | 15 +- 
src/shared/logs-show.c | 3 +- src/shared/loop-util.c | 8 +- src/shared/open-file.c | 2 +- src/shared/serialize.c | 55 +++- src/shared/tpm2-util.c | 40 ++- src/shared/tpm2-util.h | 1 + src/shared/verbs.c | 29 +- src/shared/watchdog.c | 16 +- src/systemctl/systemctl-logind.c | 2 +- src/systemctl/systemctl-show.c | 2 +- src/systemd/sd-bus-vtable.h | 8 +- src/test/test-btrfs.c | 2 +- src/test/test-copy.c | 26 +- src/test/test-fs-util.c | 46 ++-- src/test/test-open-file.c | 10 +- src/test/test-stat-util.c | 19 ++ src/tmpfiles/tmpfiles.c | 97 +++---- src/tpm2-setup/tpm2-setup.c | 17 +- src/ukify/ukify.py | 61 ++++- src/userdb/userdbctl.c | 6 +- src/userdb/userdbd-manager.c | 8 +- test/TEST-69-SHUTDOWN/test.sh | 1 + test/test-functions | 3 +- test/test-network/systemd-networkd-tests.py | 280 +++++++++++--------- test/test-shutdown.py | 22 +- test/units/testsuite-04.journal-corrupt.sh | 36 +++ test/units/testsuite-04.journal.sh | 16 +- test/units/testsuite-07.exec-context.sh | 2 + test/units/testsuite-29.sh | 13 + test/units/testsuite-45.sh | 12 +- test/units/testsuite-50.sh | 2 +- test/units/testsuite-72.sh | 4 +- test/units/testsuite-75.sh | 42 +-- tmpfiles.d/systemd.conf.in | 11 +- tools/check-efi-alignment.py | 32 +++ tools/elf2efi.py | 252 +++++++++--------- units/systemd-modules-load.service.in | 2 + units/systemd-tpm2-setup-early.service.in | 3 +- units/systemd-tpm2-setup.service.in | 2 +- 256 files changed, 4302 insertions(+), 2608 deletions(-) create mode 100644 docs/PAX_CONTROL_GROUPS.md create mode 100644 docs/assets/f17boot.png create mode 100644 man/notify-selfcontained-example.c create mode 100755 test/units/testsuite-04.journal-corrupt.sh create mode 100755 tools/check-efi-alignment.py diff --git a/docs/API_FILE_SYSTEMS.md b/docs/API_FILE_SYSTEMS.md index 84a1900..3ef9077 100644 --- a/docs/API_FILE_SYSTEMS.md +++ b/docs/API_FILE_SYSTEMS.md @@ -9,7 +9,13 @@ SPDX-License-Identifier: LGPL-2.1-or-later _So you are seeing all kinds of weird file systems in the 
output of mount(8) that are not listed in `/etc/fstab`, and you wonder what those are, how you can get rid of them, or at least change their mount options._ -The Linux kernel provides a number of different ways for userspace to communicate with it. For many facilities there are system calls, others are hidden behind Netlink interfaces, and even others are exposed via virtual file systems such as `/proc` or `/sys`. These file systems are programming interfaces, they are not actually backed by real, persistent storage. They simply use the file system interface of the kernel as interface to various unrelated mechanisms. Similarly, there are file systems that userspace uses for its own API purposes, to store shared memory segments, shared temporary files or sockets. In this article we want to discuss all these kind of _API file systems_. More specifically, here's a list of these file systems typical Linux systems currently have: +The Linux kernel provides a number of different ways for userspace to communicate with it. +For many facilities there are system calls, others are hidden behind Netlink interfaces, and even others are exposed via virtual file systems such as `/proc` or `/sys`. +These file systems are programming interfaces, they are not actually backed by real, persistent storage. +They simply use the file system interface of the kernel as interface to various unrelated mechanisms. +Similarly, there are file systems that userspace uses for its own API purposes, to store shared memory segments, shared temporary files or sockets. +In this article we want to discuss all these kind of _API file systems_. 
+More specifically, here's a list of these file systems typical Linux systems currently have: * `/sys` for exposing kernel devices, drivers and other kernel information to userspace * `/proc` for exposing kernel settings, processes and other kernel information to userspace @@ -27,26 +33,40 @@ The Linux kernel provides a number of different ways for userspace to communicat * `/sys/fs/fuse/connections` for exposing kernel FUSE connections to userspace (X) * `/sys/firmware/efi/efivars` for exposing firmware variables to userspace -All these _API file systems_ are mounted during very early boot-up of systemd and are generally not listed in `/etc/fstab`. Depending on the used kernel configuration some of these API file systems might not be available and others might exist instead. As these interfaces are important for kernel-to-userspace and userspace-to-userspace communication they are mounted automatically and without configuration or interference by the user. Disabling or changing their parameters might hence result in applications breaking as they can no longer access the interfaces they need. +All these _API file systems_ are mounted during very early boot-up of systemd and are generally not listed in `/etc/fstab`. +Depending on the used kernel configuration some of these API file systems might not be available and others might exist instead. +As these interfaces are important for kernel-to-userspace and userspace-to-userspace communication they are mounted automatically and without configuration or interference by the user. +Disabling or changing their parameters might hence result in applications breaking as they can no longer access the interfaces they need. Even though the default settings of these file systems should normally be suitable for most setups, in some cases it might make sense to change the mount options, or possibly even disable some of these file systems. 
-Even though normally none of these API file systems are listed in `/etc/fstab` they may be added there. If so, any options specified therein will be applied to that specific API file system. Hence: to alter the mount options or other parameters of these file systems, simply add them to `/etc/fstab` with the appropriate settings and you are done. Using this technique it is possible to change the source, type of a file system in addition to simply changing mount options. That is useful to turn `/tmp` to a true file system backed by a physical disk. +Even though normally none of these API file systems are listed in `/etc/fstab` they may be added there. +If so, any options specified therein will be applied to that specific API file system. +Hence: to alter the mount options or other parameters of these file systems, simply add them to `/etc/fstab` with the appropriate settings and you are done. +Using this technique it is possible to change the source, type of a file system in addition to simply changing mount options. +That is useful to turn `/tmp` to a true file system backed by a physical disk. -It is possible to disable the automatic mounting of some (but not all) of these file systems, if that is required. These are marked with (X) in the list above. You may disable them simply by masking them: +It is possible to disable the automatic mounting of some (but not all) of these file systems, if that is required. +These are marked with (X) in the list above. +You may disable them simply by masking them: ```sh systemctl mask dev-hugepages.mount ``` -This has the effect that the huge memory page API FS is not mounted by default, starting with the next boot. See [Three Levels of Off](http://0pointer.de/blog/projects/three-levels-of-off.html) for more information on masking. +This has the effect that the huge memory page API FS is not mounted by default, starting with the next boot. 
+See [Three Levels of Off](http://0pointer.de/blog/projects/three-levels-of-off.html) for more information on masking. -The systemd service [systemd-remount-fs.service](http://www.freedesktop.org/software/systemd/man/systemd-remount-fs.service.html) is responsible for applying mount parameters from `/etc/fstab` to the actual mounts. +The systemd service [systemd-remount-fs.service](http://www.freedesktop.org/software/systemd/man/systemd-remount-fs.service.html) +is responsible for applying mount parameters from `/etc/fstab` to the actual mounts. ## Why are you telling me all this? I just want to get rid of the tmpfs backed /tmp! You have three options: -1. Disable any mounting on `/tmp` so that it resides on the same physical file system as the root directory. For that, execute `systemctl mask tmp.mount` -2. Mount a different, physical file system to `/tmp`. For that, simply create an entry for it in `/etc/fstab` as you would do for any other file system. -3. Keep `/tmp` but increase/decrease the size of it. For that, also just create an entry for it in `/etc/fstab` as you would do for any other `tmpfs` file system, and use the right `size=` option. +1. Disable any mounting on `/tmp` so that it resides on the same physical file system as the root directory. + For that, execute `systemctl mask tmp.mount` +2. Mount a different, physical file system to `/tmp`. + For that, simply create an entry for it in `/etc/fstab` as you would do for any other file system. +3. Keep `/tmp` but increase/decrease the size of it. + For that, also just create an entry for it in `/etc/fstab` as you would do for any other `tmpfs` file system, and use the right `size=` option. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 1478ea0..d046f27 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -9,50 +9,37 @@ SPDX-License-Identifier: LGPL-2.1-or-later ## Code Map -This document provides a high-level overview of the various components of the -systemd repository. 
+This document provides a high-level overview of the various components of the systemd repository. ## Source Code -Directories in `src/` provide the implementation of all daemons, libraries and -command-line tools shipped by the project. There are many, and more are -constantly added, so we will not enumerate them all here — the directory -names are self-explanatory. +Directories in `src/` provide the implementation of all daemons, libraries and command-line tools shipped by the project. +There are many, and more are constantly added, so we will not enumerate them all here — the directory names are self-explanatory. ### Shared Code -The code that is shared between components is split into a few directories, -each with a different purpose: - -- `src/basic/` and `src/fundamental/` — those directories contain code - primitives that are used by all other code. `src/fundamental/` is stricter, - because it used for EFI and user-space code, while `src/basic/` is only used - for user-space code. The code in `src/fundamental/` cannot depend on any - other code in the tree, and `src/basic/` can depend only on itself and - `src/fundamental/`. For user-space, a static library is built from this code - and linked statically in various places. - -- `src/libsystemd/` implements the `libsystemd.so` shared library (also - available as static `libsystemd.a`). This code may use anything in - `src/basic/` or `src/fundamental/`. - -- `src/shared/` provides various utilities and code shared between other - components that is exposed as the `libsystemd-shared-.so` shared library. - -The other subdirectories implement individual components. They may depend only -on `src/fundamental/` + `src/basic/`, or also on `src/libsystemd/`, or also on -`src/shared/`. - -You might wonder what kind of code belongs where. In general, the rule is that -code should be linked as few times as possible, ideally only once. Thus code that -is used by "higher-level" components (e.g. 
our binaries which are linked to -`libsystemd-shared-.so`), would go to a subdirectory specific to that -component if it is only used there. If the code is to be shared between -components, it'd go to `src/shared/`. Shared code that is used by multiple -components that do not link to `libsystemd-shared-.so` may live either in -`src/libsystemd/`, `src/basic/`, or `src/fundamental/`. Any code that is used -only for EFI goes under `src/boot/efi/`, and `src/fundamental/` if is shared -with non-EFI compoenents. +The code that is shared between components is split into a few directories, each with a different purpose: + +- `src/basic/` and `src/fundamental/` — those directories contain code primitives that are used by all other code. + `src/fundamental/` is stricter, because it is used for EFI and user-space code, while `src/basic/` is only used for user-space code. + The code in `src/fundamental/` cannot depend on any other code in the tree, and `src/basic/` can depend only on itself and `src/fundamental/`. + For user-space, a static library is built from this code and linked statically in various places. + +- `src/libsystemd/` implements the `libsystemd.so` shared library (also available as static `libsystemd.a`). + This code may use anything in `src/basic/` or `src/fundamental/`. + +- `src/shared/` provides various utilities and code shared between other components that is exposed as the `libsystemd-shared-.so` shared library. + +The other subdirectories implement individual components. +They may depend only on `src/fundamental/` + `src/basic/`, or also on `src/libsystemd/`, or also on `src/shared/`. + +You might wonder what kind of code belongs where. +In general, the rule is that code should be linked as few times as possible, ideally only once. +Thus code that is used by "higher-level" components (e.g. our binaries which are linked to `libsystemd-shared-.so`), +would go to a subdirectory specific to that component if it is only used there. 
+If the code is to be shared between components, it'd go to `src/shared/`. +Shared code that is used by multiple components that do not link to `libsystemd-shared-.so` may live either in `src/libsystemd/`, `src/basic/`, or `src/fundamental/`. +Any code that is used only for EFI goes under `src/boot/efi/`, and `src/fundamental/` if it is shared with non-EFI components. To summarize: @@ -66,157 +53,118 @@ To summarize: `src/libsystemd/` - may be used by all code in the tree that links to `libsystem.so` -- may not use any code outside of `src/fundamental/`, `src/basic/`, and - `src/libsystemd/` +- may not use any code outside of `src/fundamental/`, `src/basic/`, and `src/libsystemd/` `src/shared/` -- may be used by all code in the tree, except for code in `src/basic/`, - `src/libsystemd/`, `src/nss-*`, `src/login/pam_systemd.*`, and files under - `src/journal/` that end up in `libjournal-client.a` convenience library. -- may not use any code outside of `src/fundamental/`, `src/basic/`, - `src/libsystemd/`, `src/shared/` +- may be used by all code in the tree, except for code in `src/basic/`, `src/libsystemd/`, `src/nss-*`, `src/login/pam_systemd.*`, + and files under `src/journal/` that end up in `libjournal-client.a` convenience library. +- may not use any code outside of `src/fundamental/`, `src/basic/`, `src/libsystemd/`, `src/shared/` ### PID 1 -Code located in `src/core/` implements the main logic of the systemd system (and user) -service manager. +Code located in `src/core/` implements the main logic of the systemd system (and user) service manager. BPF helpers written in C and used by PID 1 can be found under `src/core/bpf/`. #### Implementing Unit Settings -The system and session manager supports a large number of unit settings. These can generally -be configured in three ways: +The system and session manager supports a large number of unit settings. +These can generally be configured in three ways: 1. 
Via textual, INI-style configuration files called *unit* *files* 2. Via D-Bus messages to the manager 3. Via the `systemd-run` and `systemctl set-property` commands -From a user's perspective, the third is a wrapper for the second. To implement a new unit -setting, it is necessary to support all three input methods: +From a user's perspective, the third is a wrapper for the second. +To implement a new unit setting, it is necessary to support all three input methods: -1. *unit* *files* are parsed in `src/core/load-fragment.c`, with many simple and fixed-type -unit settings being parsed by common helpers, with the definition in the generator file -`src/core/load-fragment-gperf.gperf.in` +1. *unit* *files* are parsed in `src/core/load-fragment.c`, with many simple and fixed-type unit settings being parsed by common helpers, with the definition in the generator file `src/core/load-fragment-gperf.gperf.in` 2. D-Bus messages are defined and parsed in `src/core/dbus-*.c` -3. `systemd-run` and `systemctl set-property` do client-side parsing and translation into -D-Bus messages in `src/shared/bus-unit-util.c` +3. `systemd-run` and `systemctl set-property` do client-side parsing and translation into D-Bus messages in `src/shared/bus-unit-util.c` -So that they are exercised by the fuzzing CI, new unit settings should also be listed in the -text files under `test/fuzz/fuzz-unit-file/`. +So that they are exercised by the fuzzing CI, new unit settings should also be listed in the text files under `test/fuzz/fuzz-unit-file/`. ### systemd-udev -Sources for the udev daemon and command-line tool (single binary) can be found under -`src/udev/`. +Sources for the udev daemon and command-line tool (single binary) can be found under `src/udev/`. ### Unit Tests -Source files found under `src/test/` implement unit-level testing, mostly for -modules found in `src/basic/` and `src/shared/`, but not exclusively. 
Each test -file is compiled in a standalone binary that can be run to exercise the -corresponding module. While most of the tests can be run by any user, some -require privileges, and will attempt to clearly log about what they need -(mostly in the form of effective capabilities). These tests are self-contained, -and generally safe to run on the host without side effects. +Source files found under `src/test/` implement unit-level testing, mostly for modules found in `src/basic/` and `src/shared/`, but not exclusively. +Each test file is compiled in a standalone binary that can be run to exercise the corresponding module. +While most of the tests can be run by any user, some require privileges, and will attempt to clearly log about what they need (mostly in the form of effective capabilities). +These tests are self-contained, and generally safe to run on the host without side effects. -Ideally, every module in `src/basic/` and `src/shared/` should have a -corresponding unit test under `src/test/`, exercising every helper function. +Ideally, every module in `src/basic/` and `src/shared/` should have a corresponding unit test under `src/test/`, exercising every helper function. ### Fuzzing -Fuzzers are a type of unit tests that execute code on an externally-supplied -input sample. Fuzzers are called `fuzz-*`. Fuzzers for `src/basic/` and -`src/shared` live under `src/fuzz/`, and those for other parts of the codebase -should be located next to the code they test. - -Files under `test/fuzz/` contain input data for fuzzers, one subdirectory for -each fuzzer. Some of the files are "seed corpora", i.e. files that contain -lists of settings and input values intended to generate initial coverage, and -other files are samples saved by the fuzzing engines when they find an issue. - -When adding new input samples under `test/fuzz/*/`, please use some -short-but-meaningful names. Names of meson tests include the input file name -and output looks awkward if they are too long. 
- -Fuzzers are invoked primarily in three ways: firstly, each fuzzer is compiled -as a normal executable and executed for each of the input samples under -`test/fuzz/` as part of the test suite. Secondly, fuzzers may be instrumented -with sanitizers and invoked as part of the test suite (if `-Dfuzz-tests=true` -is configured). Thirdly, fuzzers are executed through fuzzing engines that try -to find new "interesting" inputs through coverage feedback and massive -parallelization; see the links for oss-fuzz in [Code quality](CODE_QUALITY). -For testing and debugging, fuzzers can be executed as any other program, -including under `valgrind` or `gdb`. +Fuzzers are a type of unit tests that execute code on an externally-supplied input sample. +Fuzzers are called `fuzz-*`. +Fuzzers for `src/basic/` and `src/shared` live under `src/fuzz/`, and those for other parts of the codebase should be located next to the code they test. + +Files under `test/fuzz/` contain input data for fuzzers, one subdirectory for each fuzzer. +Some of the files are "seed corpora", i.e. files that contain lists of settings and input values intended to generate initial coverage, and other files are samples saved by the fuzzing engines when they find an issue. + +When adding new input samples under `test/fuzz/*/`, please use some short-but-meaningful names. +Names of meson tests include the input file name and output looks awkward if they are too long. + +Fuzzers are invoked primarily in three ways: +firstly, each fuzzer is compiled as a normal executable and executed for each of the input samples under `test/fuzz/` as part of the test suite. +Secondly, fuzzers may be instrumented with sanitizers and invoked as part of the test suite (if `-Dfuzz-tests=true` is configured). +Thirdly, fuzzers are executed through fuzzing engines that try to find new "interesting" inputs through coverage feedback and massive parallelization; see the links for oss-fuzz in [Code quality](/CODE_QUALITY). 
+For testing and debugging, fuzzers can be executed as any other program, including under `valgrind` or `gdb`. ## Integration Tests -Sources in `test/TEST-*` implement system-level testing for executables, -libraries and daemons that are shipped by the project. They require privileges -to run, and are not safe to execute directly on a host. By default they will -build an image and run the test under it via `qemu` or `systemd-nspawn`. +Sources in `test/TEST-*` implement system-level testing for executables, libraries and daemons that are shipped by the project. +They require privileges to run, and are not safe to execute directly on a host. +By default they will build an image and run the test under it via `qemu` or `systemd-nspawn`. -Most of those tests should be able to run via `systemd-nspawn`, which is -orders-of-magnitude faster than `qemu`, but some tests require privileged -operations like using `dm-crypt` or `loopdev`. They are clearly marked if that -is the case. +Most of those tests should be able to run via `systemd-nspawn`, which is orders-of-magnitude faster than `qemu`, but some tests require privileged operations like using `dm-crypt` or `loopdev`. +They are clearly marked if that is the case. -See `test/README.testsuite` for more specific details. +See [`test/README.testsuite`](https://github.com/systemd/systemd/blob/main/test/README.testsuite) for more specific details. ## hwdb -Rules built in the static hardware database shipped by the project can be found -under `hwdb.d/`. Some of these files are updated automatically, some are filled -by contributors. +Rules built in the static hardware database shipped by the project can be found under `hwdb.d/`. +Some of these files are updated automatically, some are filled by contributors. ## Documentation ### systemd.io -Markdown files found under `docs/` are automatically published on the -[systemd.io](https://systemd.io) website using Github Pages. 
A minimal unit test -to ensure the formatting doesn't have errors is included in the -`meson test -C build/ github-pages` run as part of the CI. +Markdown files found under `docs/` are automatically published on the [systemd.io](https://systemd.io) website using Github Pages. +A minimal unit test to ensure the formatting doesn't have errors is included in the `meson test -C build/ github-pages` run as part of the CI. ### Man pages -Manpages for binaries and libraries, and the DBUS interfaces, can be found under -`man/` and should ideally be kept in sync with changes to the corresponding -binaries and libraries. +Manpages for binaries and libraries, and the DBUS interfaces, can be found under `man/` and should ideally be kept in sync with changes to the corresponding binaries and libraries. ### Translations -Translations files for binaries and daemons, provided by volunteers, can be found -under `po/` in the usual format. They are kept up to date by contributors and by -automated tools. +Translations files for binaries and daemons, provided by volunteers, can be found under `po/` in the usual format. +They are kept up to date by contributors and by automated tools. ## System Configuration files and presets -Presets (or templates from which they are generated) for various daemons and tools -can be found under various directories such as `factory/`, `modprobe.d/`, `network/`, -`presets/`, `rules.d/`, `shell-completion/`, `sysctl.d/`, `sysusers.d/`, `tmpfiles.d/`. +Presets (or templates from which they are generated) for various daemons and tools can be found under various directories such as +`factory/`, `modprobe.d/`, `network/`, `presets/`, `rules.d/`, `shell-completion/`, `sysctl.d/`, `sysusers.d/`, `tmpfiles.d/`. ## Utilities for Developers -`tools/`, `coccinelle/`, `.github/`, `.semaphore/`, `.mkosi/` host various -utilities and scripts that are used by maintainers and developers. They are not -shipped or installed. 
+`tools/`, `coccinelle/`, `.github/`, `.semaphore/`, `.mkosi/` host various utilities and scripts that are used by maintainers and developers. +They are not shipped or installed. # Service Manager Overview -The Service Manager takes configuration in the form of unit files, credentials, -kernel command line options and D-Bus commands, and based on those manages the -system and spawns other processes. It runs in system mode as PID1, and in user -mode with one instance per user session. - -When starting a unit requires forking a new process, configuration for the new -process will be serialized and passed over to the new process, created via a -posix_spawn() call. This is done in order to avoid excessive processing after -a fork() but before an exec(), which is against glibc's best practices and can -also result in a copy-on-write trap. The new process will start as the -`systemd-executor` binary, which will deserialize the configuration and apply -all the options (sandboxing, namespacing, cgroup, etc.) before exec'ing the -configured executable. +The Service Manager takes configuration in the form of unit files, credentials, kernel command line options and D-Bus commands, and based on those manages the system and spawns other processes. +It runs in system mode as PID1, and in user mode with one instance per user session. + +When starting a unit requires forking a new process, configuration for the new process will be serialized and passed over to the new process, created via a posix_spawn() call. +This is done in order to avoid excessive processing after a fork() but before an exec(), which is against glibc's best practices and can also result in a copy-on-write trap. +The new process will start as the `systemd-executor` binary, which will deserialize the configuration and apply all the options (sandboxing, namespacing, cgroup, etc.) before exec'ing the configured executable. 
``` ┌──────┐posix_spawn() ┌───────────┐execve() ┌────────┐ diff --git a/docs/BACKPORTS.md b/docs/BACKPORTS.md index 6fbb57d..b2aba23 100644 --- a/docs/BACKPORTS.md +++ b/docs/BACKPORTS.md @@ -7,13 +7,22 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Backports -The upstream systemd git repo at [https://github.com/systemd/systemd](https://github.com/systemd/systemd) only contains the main systemd branch that progresses at a quick pace, continuously bringing both bugfixes and new features. Distributions usually prefer basing their releases on stabilized versions branched off from this, that receive the bugfixes but not the features. +The upstream systemd git repo at [https://github.com/systemd/systemd](https://github.com/systemd/systemd) only contains the main systemd branch that progresses at a quick pace, continuously bringing both bugfixes and new features. + +Distributions usually prefer basing their releases on stabilized versions branched off from this, that receive the bugfixes but not the features. ## Stable Branch Repository Stable branches are available from [https://github.com/systemd/systemd-stable](https://github.com/systemd/systemd-stable). -Stable branches are started for certain releases of systemd and named after them, e.g. v208-stable. Stable branches are typically managed by distribution maintainers on an as needed basis. For example v208 has been chosen for stable as several distributions are shipping this version and the official/upstream cycle of v208-v209 was a long one due to kdbus work. If you are using a particular version and find yourself backporting several patches, you may consider pushing a stable branch here for that version so others can benefit. Please contact us if you are interested. +Stable branches are started for certain releases of systemd and named after them, e.g. v208-stable. +Stable branches are typically managed by distribution maintainers on an as needed basis. 
+ +For example v208 has been chosen for stable as several distributions are shipping this version and the official/upstream cycle of v208-v209 was a long one due to kdbus work. + +If you are using a particular version and find yourself backporting several patches, you may consider pushing a stable branch here for that version so others can benefit. + +Please contact us if you are interested. The following types of commits are cherry-picked onto those branches: @@ -22,4 +31,5 @@ The following types of commits are cherry-picked onto those branches: * hardware database additions, especially the keymap updates * small non-conflicting features deemed safe to add in a stable release -Please try to ensure that anything backported to the stable repository is done with the `git cherry-pick -x` option such that text stating the original SHA1 is added into the commit message. This makes it easier to check where the code came from (as sometimes it is necessary to add small fixes as new code due to the upstream refactors that are deemed too invasive to backport as a stable patch. +Please try to ensure that anything backported to the stable repository is done with the `git cherry-pick -x` option such that text stating the original SHA1 is added into the commit message. +This makes it easier to check where the code came from (as sometimes it is necessary to add small fixes as new code due to the upstream refactors that are deemed too invasive to backport as a stable patch). diff --git a/docs/BOOT.md b/docs/BOOT.md index 574cc08..5590e6a 100644 --- a/docs/BOOT.md +++ b/docs/BOOT.md @@ -9,11 +9,15 @@ SPDX-License-Identifier: LGPL-2.1-or-later systemd-boot is a UEFI boot manager which executes configured EFI images. The default entry is selected by a configured pattern (glob) or an on-screen menu. -systemd-boot operates on the EFI System Partition (ESP) only. Configuration file fragments, kernels, initrds, other EFI images need to reside on the ESP.
Linux kernels need to be built with CONFIG\_EFI\_STUB to be able to be directly executed as an EFI image. +systemd-boot operates on the EFI System Partition (ESP) only. Configuration file fragments, kernels, initrds, other EFI images need to reside on the ESP. + +Linux kernels need to be built with CONFIG\_EFI\_STUB to be able to be directly executed as an EFI image. systemd-boot reads simple and entirely generic boot loader configuration files; one file per boot loader entry to select from. All files need to reside on the ESP. -Pressing the Space key (or most other keys actually work too) during bootup will show an on-screen menu with all configured loader entries to select from. Pressing Enter on the selected entry loads and starts the EFI image. +Pressing the Space key (or most other keys actually work too) during bootup will show an on-screen menu with all configured loader entries to select from. + +Pressing Enter on the selected entry loads and starts the EFI image. If no timeout is configured, which is the default setting, and no key pressed during bootup, the default entry is executed right away. @@ -108,4 +112,4 @@ Links: [https://github.com/systemd/systemd](https://github.com/systemd/systemd) -[http://www.freedesktop.org/wiki/Specifications/BootLoaderSpec/](http://www.freedesktop.org/wiki/Specifications/BootLoaderSpec/) +[https://uapi-group.org/specifications/specs/boot_loader_specification/](https://uapi-group.org/specifications/specs/boot_loader_specification/) diff --git a/docs/BOOT_LOADER_INTERFACE.md b/docs/BOOT_LOADER_INTERFACE.md index a1f6b59..4bf074d 100644 --- a/docs/BOOT_LOADER_INTERFACE.md +++ b/docs/BOOT_LOADER_INTERFACE.md @@ -78,7 +78,7 @@ variables. All EFI variables use the vendor UUID * `1 << 1` → The boot loader honours `LoaderConfigTimeoutOneShot` when set. * `1 << 2` → The boot loader honours `LoaderEntryDefault` when set. * `1 << 3` → The boot loader honours `LoaderEntryOneShot` when set. 
- * `1 << 4` → The boot loader supports boot counting as described in [Automatic Boot Assessment](AUTOMATIC_BOOT_ASSESSMENT). + * `1 << 4` → The boot loader supports boot counting as described in [Automatic Boot Assessment](/AUTOMATIC_BOOT_ASSESSMENT). * `1 << 5` → The boot loader supports looking for boot menu entries in the Extended Boot Loader Partition. * `1 << 6` → The boot loader supports passing a random seed to the OS. * `1 << 13` → The boot loader honours `menu-disabled` option when set. diff --git a/docs/BUILDING_IMAGES.md b/docs/BUILDING_IMAGES.md index b11afa3..dcae4bb 100644 --- a/docs/BUILDING_IMAGES.md +++ b/docs/BUILDING_IMAGES.md @@ -9,12 +9,14 @@ SPDX-License-Identifier: LGPL-2.1-or-later In many scenarios OS installations are shipped as pre-built images, that require no further installation process beyond simple `dd`-ing the image to -disk and booting it up. When building such "golden" OS images for +disk and booting it up. +When building such "golden" OS images for `systemd`-based OSes a few points should be taken into account. Most of the points described here are implemented by the [`mkosi`](https://github.com/systemd/mkosi) OS image builder developed and -maintained by the systemd project. If you are using or working on another image +maintained by the systemd project. +If you are using or working on another image builder it's recommended to keep the following concepts and recommendations in mind. @@ -24,28 +26,25 @@ Typically the same OS image shall be deployable in multiple instances, and each instance should automatically acquire its own identifying credentials on first boot. For that it's essential to: -1. Remove the - [`/etc/machine-id`](https://www.freedesktop.org/software/systemd/man/machine-id.html) - file or write the string `uninitialized\n` into it. This file is supposed to - carry a 128-bit identifier unique to the system. Only when it is reset it - will be auto-generated on first boot and thus be truly unique. 
If this file - is not reset, and carries a valid ID every instance of the system will come +1. Remove the [`/etc/machine-id`](https://www.freedesktop.org/software/systemd/man/machine-id.html) + file or write the string `uninitialized\n` into it. + This file is supposed to carry a 128-bit identifier unique to the system. + Only when it is reset it will be auto-generated on first boot and thus be truly unique. + If this file is not reset, and carries a valid ID every instance of the system will come up with the same ID and that will likely lead to problems sooner or later, - as many network-visible identifiers are commonly derived from the machine - ID, for example, IPv6 addresses or transient MAC addresses. + as many network-visible identifiers are commonly derived from the machine ID, + for example, IPv6 addresses or transient MAC addresses. -2. Remove the `/var/lib/systemd/random-seed` file (see +2. Remove the `/var/lib/systemd/random-seed` file (see [`systemd-random-seed(8)`](https://www.freedesktop.org/software/systemd/man/systemd-random-seed.service.html)), - which is used to seed the kernel's random pool on boot. If this file is - shipped pre-initialized, every instance will seed its random pool with the + which is used to seed the kernel's random pool on boot. + If this file is shipped pre-initialized, every instance will seed its random pool with the same random data that is included in the image, and thus possibly generate - random data that is more similar to other instances booted off the same - image than advisable. + random data that is more similar to other instances booted off the same image than advisable. 3. Remove the `/loader/random-seed` file (see [`systemd-boot(7)`](https://www.freedesktop.org/software/systemd/man/systemd-boot.html)) - from the UEFI System Partition (ESP), in case the `systemd-boot` boot loader - is used in the image. + from the UEFI System Partition (ESP), in case the `systemd-boot` boot loader is used in the image. 4.
It might also make sense to remove [`/etc/hostname`](https://www.freedesktop.org/software/systemd/man/hostname.html) @@ -69,24 +68,25 @@ The logic used to generate [Boot Loader Specification Type #1](https://uapi-group.org/specifications/specs/boot_loader_specification/#type-1-boot-loader-specification-entries) entries by default uses the machine ID as stored in `/etc/machine-id` for -naming boot menu entries and the directories in the ESP to place kernel images -in. This is done in order to allow multiple installations of the same OS on the +naming boot menu entries and the directories in the ESP to place kernel images in. +This is done in order to allow multiple installations of the same OS on the same system without conflicts. However, this is problematic if the machine ID shall be generated automatically on first boot: if the ID is not known before the first boot it cannot be used to name the most basic resources required for the boot process to complete. Thus, for images that shall acquire their identity on first boot only, it is -required to use a different identifier for naming boot menu entries. To allow -this the `kernel-install` logic knows the generalized *entry* *token* concept, +required to use a different identifier for naming boot menu entries. +To allow this the `kernel-install` logic knows the generalized *entry* *token* concept, which can be a freely chosen string to use for identifying the boot menu -resources of the OS. If not configured explicitly it defaults to the machine -ID. The file `/etc/kernel/entry-token` may be used to configure this string -explicitly. Thus, golden image builders should write a suitable identifier into +resources of the OS. +If not configured explicitly it defaults to the machine ID. +The file `/etc/kernel/entry-token` may be used to configure this string explicitly.
+Thus, golden image builders should write a suitable identifier into this file, for example, the `IMAGE_ID=` or `ID=` field from [`/etc/os-release`](https://www.freedesktop.org/software/systemd/man/os-release.html) -(also see below). It is recommended to do this before the `kernel-install` -functionality is invoked (i.e. before the package manager is used to install +(also see below). +It is recommended to do this before the `kernel-install` functionality is invoked (i.e. before the package manager is used to install packages into the OS tree being prepared), so that the selected string is automatically used for all entries to be generated. @@ -94,16 +94,16 @@ automatically used for all entries to be generated. `systemd` is designed to be able to come up safely and robustly if the `/var/` file system or even the entire root file system (with exception of `/usr/`, -i.e. the vendor OS resources) is empty (i.e. "unpopulated"). With this in mind -it's relatively easy to build images that only ship a `/usr/` tree, and +i.e. the vendor OS resources) is empty (i.e. "unpopulated"). +With this in mind it's relatively easy to build images that only ship a `/usr/` tree, and otherwise carry no other data, populating the rest of the directory hierarchy on first boot as needed. Specifically, the following mechanisms are in place: 1. The `switch-root` logic in systemd, that is used to switch from the initrd - phase to the host will create the basic OS hierarchy skeleton if missing. It - will create a couple of directories strictly necessary to boot up + phase to the host will create the basic OS hierarchy skeleton if missing. + It will create a couple of directories strictly necessary to boot up successfully, plus essential symlinks (such as those necessary for the dynamic loader `ld.so` to function). @@ -136,14 +136,18 @@ Specifically, the following mechanisms are in place: remains resolvable, even without `/etc/hosts` around. 
With these mechanisms the hierarchies below `/var/` and `/etc/` can be safely -and robustly populated on first boot, so that the OS can safely boot up. Note -that some auxiliary package are not prepared to operate correctly if their +and robustly populated on first boot, so that the OS can safely boot up. +Note that some auxiliary packages are not prepared to operate correctly if their configuration data in `/etc/` or their state directories in `/var/` are -missing. This can typically be addressed via `systemd-tmpfiles` lines that -ensure the missing files and directories are created if missing. In particular, -configuration files that are necessary for operation can be automatically +missing. + +This can typically be addressed via `systemd-tmpfiles` lines that +ensure the missing files and directories are created if missing. +In particular, configuration files that are necessary for operation can be automatically copied or symlinked from the `/usr/share/factory/etc/` tree via the `C` or `L` -line types. That said, we recommend that all packages safely fall back to +line types. + +That said, we recommend that all packages safely fall back to internal defaults if their configuration is missing, making such additional steps unnecessary. @@ -156,17 +160,17 @@ manual work might be required to make this scenario work. Typically, if an image is `dd`-ed onto a target disk it will be minimal: i.e. only consist of necessary vendor data, and lack "payload" data, that shall -be individual to the system, and dependent on host parameters. On first boot, -the OS should take possession of the backing storage as necessary, dynamically +be individual to the system, and dependent on host parameters. +On first boot, the OS should take possession of the backing storage as necessary, dynamically using available space. Specifically: 1. Additional partitions should be created, that make no sense to ship - pre-built in the image.
For example, `/tmp/` or `/home/` partitions, or even - `/var/` or the root file system (see above). + pre-built in the image. + For example, `/tmp/` or `/home/` partitions, or even `/var/` or the root file system (see above). 2. Additional partitions should be created that shall function as A/B - secondaries for partitions shipped in the original image. In other words: if - the `/usr/` file system shall be updated in an A/B fashion it typically + secondaries for partitions shipped in the original image. + In other words: if the `/usr/` file system shall be updated in an A/B fashion it typically makes sense to ship the original A file system in the deployed image, but create the B partition on first boot. @@ -191,10 +195,10 @@ it, then format it. 1. The [`systemd-repart(8)`](https://www.freedesktop.org/software/systemd/man/systemd-repart.service.html) component may manipulate GPT partition tables automatically on boot, growing - partitions or adding in partitions taking the backing storage size into - account. It can also encrypt partitions automatically it creates (even bind - to TPM2, automatically) and populate partitions from various sources. It - does this all in a robust fashion so that aborted invocations will not leave + partitions or adding in partitions taking the backing storage size into account. + It can also automatically encrypt partitions it creates (even bind + to TPM2, automatically) and populate partitions from various sources. + It does this all in a robust fashion so that aborted invocations will not leave incompletely set up partitions around. 2. The @@ -215,8 +219,8 @@ it, then format it. While a lot of work has gone into ensuring `systemd` systems can safely boot with unpopulated `/etc/` trees, it sometimes is desirable to set a couple of -basic settings *after* `dd`-ing the image to disk, but *before* first boot. For -this the tool +basic settings *after* `dd`-ing the image to disk, but *before* first boot.
+For this the tool [`systemd-firstboot(1)`](https://www.freedesktop.org/software/systemd/man/systemd-firstboot.html) can be useful, with its `--image=` switch. It may be used to set very basic settings, such as the root password or hostname on an OS disk image or @@ -225,36 +229,33 @@ installed block device. ## Distinguishing First Boot For various purposes it's useful to be able to distinguish the first boot-up of -the system from later boot-ups (for example, to set up TPM hardware -specifically, or register a system somewhere). `systemd` provides mechanisms to -implement that. Specifically, the `ConditionFirstBoot=` and `AssertFirstBoot=` -settings may be used to conditionalize units to only run on first boot. See -[`systemd.unit(5)`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#ConditionFirstBoot=) +the system from later boot-ups (for example, to set up TPM hardware specifically, or register a system somewhere). +`systemd` provides mechanisms to implement that. +Specifically, the `ConditionFirstBoot=` and `AssertFirstBoot=` settings may be used to conditionalize units to only run on first boot. +See [`systemd.unit(5)`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#ConditionFirstBoot=) for details. A special target unit `first-boot-complete.target` may be used as milestone to -safely handle first boots where the system is powered off too early: if the -first boot process is aborted before this target is reached, the following boot -process will be considered a first boot, too. Once the target is reached, -subsequent boots will not be considered first boots anymore, even if the boot -process is aborted immediately after. Thus, services that must complete fully -before a system shall be considered fully past the first boot should be ordered -before this target unit. 
+safely handle first boots where the system is powered off too early: +if the first boot process is aborted before this target is reached, the following boot +process will be considered a first boot, too. +Once the target is reached, subsequent boots will not be considered first boots anymore, even if the boot +process is aborted immediately after. +Thus, services that must complete fully before a system shall be considered fully past the first boot should be ordered before this target unit. Whether a system will come up in first boot state or not is derived from the -initialization status of `/etc/machine-id`: if the file already carries a valid -ID the system is already past the first boot. If it is not initialized yet it -is still considered in the first boot state. For details see -[`machine-id(5)`](https://www.freedesktop.org/software/systemd/man/machine-id.html). +initialization status of `/etc/machine-id`: +if the file already carries a valid ID the system is already past the first boot. +If it is not initialized yet it is still considered in the first boot state. +For details see [`machine-id(5)`](https://www.freedesktop.org/software/systemd/man/machine-id.html). ## Image Metadata Typically, when operating with golden disk images it is useful to be able to -identify them and their version. For this the two fields `IMAGE_ID=` and -`IMAGE_VERSION=` have been defined in -[`os-release(5)`](https://www.freedesktop.org/software/systemd/man/os-release.html). These -fields may be accessed from unit files and similar via the `%M` and `%A` -specifiers. +identify them and their version. +For this the two fields `IMAGE_ID=` and `IMAGE_VERSION=` have been defined in +[`os-release(5)`](https://www.freedesktop.org/software/systemd/man/os-release.html). +These fields may be accessed from unit files and similar via the `%M` and `%A` specifiers. 
Depending on how the images are put together it might make sense to leave the OS distribution's `os-release` file as is in `/usr/lib/os-release` but to diff --git a/docs/CATALOG.md b/docs/CATALOG.md index bcbf5b9..f700ff0 100644 --- a/docs/CATALOG.md +++ b/docs/CATALOG.md @@ -7,7 +7,9 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Journal Message Catalogs -Starting with 196 systemd includes a message catalog system which allows augmentation on display of journal log messages with short explanation texts, keyed off the MESSAGE\_ID= field of the entry. Many important log messages generated by systemd itself have message catalog entries. External packages can easily provide catalog data for their own messages. +Starting with 196 systemd includes a message catalog system which allows augmentation on display of journal log messages with short explanation texts, keyed off the MESSAGE\_ID= field of the entry. +Many important log messages generated by systemd itself have message catalog entries. +External packages can easily provide catalog data for their own messages. The message catalog has a number of purposes: @@ -18,11 +20,15 @@ The message catalog has a number of purposes: ## Format -Message catalog source files are simple text files that follow an RFC822 inspired format. To get an understanding of the format [here's an example file](http://cgit.freedesktop.org/systemd/systemd/plain/catalog/systemd.catalog), which includes entries for many important messages systemd itself generates. On installation of a package that includes message catalogs all installed message catalog source files get compiled into a binary index, which is then used to look up catalog data. +Message catalog source files are simple text files that follow an RFC822 inspired format. +To get an understanding of the format [here's an example file](http://cgit.freedesktop.org/systemd/systemd/plain/catalog/systemd.catalog), which includes entries for many important messages systemd itself generates. 
+On installation of a package that includes message catalogs all installed message catalog source files get compiled into a binary index, which is then used to look up catalog data. -journalctl's `-x` command line parameter may be used to augment on display journal log messages with message catalog data when browsing. `journalctl --list-catalog` may be used to print a list of all known catalog entries. +journalctl's `-x` command line parameter may be used to augment on display journal log messages with message catalog data when browsing. +`journalctl --list-catalog` may be used to print a list of all known catalog entries. -To register additional catalog entries, packages may drop (text) catalog files into /usr/lib/systemd/catalog/ with a suffix of .catalog. The files are not accessed directly when needed, but need to be built into a binary index file with `journalctl --update-catalog`. +To register additional catalog entries, packages may drop (text) catalog files into /usr/lib/systemd/catalog/ with a suffix of .catalog. +The files are not accessed directly when needed, but need to be built into a binary index file with `journalctl --update-catalog`. Here's an example how a single catalog entry looks like in the text source format. Multiple of these may be listed one after the other per catalog source file: @@ -43,25 +49,50 @@ should be reported to its vendor as a bug. The text format of the .catalog files is as follows: -* Simple, UTF-8 text files, with usual line breaks at 76 chars. URLs and suchlike where line-breaks are undesirable may use longer lines. As catalog files need to be usable on text consoles it is essential that the 76 char line break rule is otherwise followed for human readable text. +* Simple, UTF-8 text files, with usual line breaks at 76 chars. + URLs and suchlike where line-breaks are undesirable may use longer lines. 
+ As catalog files need to be usable on text consoles it is essential that the 76 char line break rule is otherwise followed for human readable text. * Lines starting with `#` are ignored, and may be used for comments. -* The files consist of a series of entries. For each message ID (in combination with a locale) only a single entry may be defined. Every entry consists of: - * A separator line beginning with `-- `, followed by a hexadecimal message ID formatted as lower case ASCII string. Optionally, the message ID may be suffixed by a space and a locale identifier, such as `de` or `fr\_FR`, if i10n is required. - * A series of entry headers, in RFC822-style but not supporting continuation lines. Some header fields may appear more than once per entry. The following header fields are currently known (but additional fields may be added later): + +* The files consist of a series of entries. + For each message ID (in combination with a locale) only a single entry may be defined. + Every entry consists of: + * A separator line beginning with `-- `, followed by a hexadecimal message ID formatted as lower case ASCII string. + Optionally, the message ID may be suffixed by a space and a locale identifier, such as `de` or `fr_FR`, if l10n is required. + * A series of entry headers, in RFC822-style but not supporting continuation lines. + Some header fields may appear more than once per entry. + The following header fields are currently known (but additional fields may be added later): + * Subject: A short, one-line human readable description of the message - * Defined-By: Who defined this message. Usually a package name or suchlike - * Support: A URI for getting further support. This can be a web URL or a telephone number in the tel:// namespace + * Defined-By: Who defined this message. + + Usually a package name or suchlike + * Support: A URI for getting further support.
+ This can be a web URL or a telephone number in the tel:// namespace * Documentation: URIs for further user, administrator or developer documentation on the log entry. URIs should be listed in order of relevance, the most relevant documentation first. * An empty line - * The actual catalog entry payload, as human readable prose. Multiple paragraphs may be separated by empty lines. The prose should first describe the message and when it occurs, possibly followed by recommendations how to deal with the message and (if it is an error message) correct the problem at hand. This message text should be readable by users and administrators. Information for developers should be stored externally instead, and referenced via a Documentation= header field. -* When a catalog entry is printed on screen for a specific log entry simple variable replacements are applied. Journal field names enclosed in @ will be replaced by their values, if such a field is available in an entry. If such a field is not defined in an entry the enclosing @ will be dropped but the variable name is kept. See [systemd's own message catalog](http://cgit.freedesktop.org/systemd/systemd/plain/catalog/systemd.catalog) for a complete example for a catalog file. + * The actual catalog entry payload, as human readable prose. + Multiple paragraphs may be separated by empty lines. + The prose should first describe the message and when it occurs, possibly followed by recommendations how to deal with the message and (if it is an error message) correct the problem at hand. + This message text should be readable by users and administrators. + Information for developers should be stored externally instead, and referenced via a Documentation= header field. + +* When a catalog entry is printed on screen for a specific log entry simple variable replacements are applied. +Journal field names enclosed in @ will be replaced by their values, if such a field is available in an entry. 
+If such a field is not defined in an entry the enclosing @ will be dropped but the variable name is kept. +See [systemd's own message catalog](http://cgit.freedesktop.org/systemd/systemd/plain/catalog/systemd.catalog) for a complete example for a catalog file. ## Adding Message Catalog Support to Your Program -Note that the message catalog is only available for messages generated with the MESSAGE\_ID= journal meta data field, as this is need to find the right entry for a message. For more information on the MESSAGE\_ID= journal entry field see [systemd.journal-fields(7)](http://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html). +Note that the message catalog is only available for messages generated with the MESSAGE\_ID= journal meta data field, as this is needed to find the right entry for a message. +For more information on the MESSAGE\_ID= journal entry field see [systemd.journal-fields(7)](http://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html). To add message catalog entries for log messages your application generates, please follow the following guidelines: -* Use the [native Journal logging APIs](http://0pointer.de/blog/projects/journal-submit.html) to generate your messages, and define message IDs for all messages you want to add catalog entries for. You may use `journalctl --new-id128` to allocate new message IDs. -* Write a catalog entry file for your messages and ship them in your package and install them to `/usr/lib/systemd/catalog/` (if you package your software with RPM use `%_journalcatalogdir`) -* Ensure that after installation of your application's RPM/DEB "`journalctl --update-catalog`" is executed, in order to update the binary catalog index. (if you package your software with RPM use the `%journal_catalog_update` macro to achieve that.)
+* Use the [native Journal logging APIs](http://0pointer.de/blog/projects/journal-submit.html) + to generate your messages, and define message IDs for all messages you want to add catalog entries for. + You may use `journalctl --new-id128` to allocate new message IDs. +* Write a catalog entry file for your messages and ship them in your package and install them to `/usr/lib/systemd/catalog/` + (if you package your software with RPM use `%_journalcatalogdir`) +* Ensure that after installation of your application's RPM/DEB "`journalctl --update-catalog`" is executed, in order to update the binary catalog index. + (if you package your software with RPM use the `%journal_catalog_update` macro to achieve that.) diff --git a/docs/CGROUP_DELEGATION.md b/docs/CGROUP_DELEGATION.md index 4210a75..9e2e76c 100644 --- a/docs/CGROUP_DELEGATION.md +++ b/docs/CGROUP_DELEGATION.md @@ -29,8 +29,8 @@ This document then adds in the higher-level view from systemd. This document augments the existing documentation we already have: -* [The New Control Group Interfaces](https://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface) -* [Writing VM and Container Managers](https://www.freedesktop.org/wiki/Software/systemd/writing-vm-managers) +* [The New Control Group Interfaces](/CONTROL_GROUP_INTERFACE) +* [Writing VM and Container Managers](/WRITING_VM_AND_CONTAINER_MANAGERS) These wiki documents are not as up to date as they should be, currently, but the basic concepts still fully apply. You should read them too, if you do something diff --git a/docs/CODE_OF_CONDUCT.md b/docs/CODE_OF_CONDUCT.md index 8e5455d..08ccc30 100644 --- a/docs/CODE_OF_CONDUCT.md +++ b/docs/CODE_OF_CONDUCT.md @@ -7,7 +7,8 @@ SPDX-License-Identifier: LGPL-2.1-or-later # The systemd Community Conduct Guidelines -This document provides community guidelines for a safe, respectful, productive, and collaborative place for any person who is willing to contribute to systemd. 
It applies to all “collaborative spaces”, which is defined as community communications channels (such as mailing lists, submitted patches, commit comments, etc.). +This document provides community guidelines for a safe, respectful, productive, and collaborative place for any person who is willing to contribute to systemd. +It applies to all “collaborative spaces”, which is defined as community communications channels (such as mailing lists, submitted patches, commit comments, etc.). - Participants will be tolerant of opposing views. - Participants must ensure that their language and actions are free of personal attacks and disparaging personal remarks. @@ -16,6 +17,9 @@ This document provides community guidelines for a safe, respectful, productive, ## Enforcement -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at systemd-conduct@googlegroups.com. This team currently consists of David Strauss <>, Ekaterina Gerasimova (Kat) <>, and Zbigniew Jędrzejewski-Szmek <>. In the unfortunate event that you wish to make a complaint against one of the members, you may instead contact any of the other members individually. +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at systemd-conduct@googlegroups.com. +This team currently consists of David Strauss <>, Ekaterina Gerasimova (Kat) <>, and Zbigniew Jędrzejewski-Szmek <>. +In the unfortunate event that you wish to make a complaint against one of the members, you may instead contact any of the other members individually. -All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. 
+All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. +The project team is obligated to maintain confidentiality with regard to the reporter of an incident. diff --git a/docs/CODE_QUALITY.md b/docs/CODE_QUALITY.md index 166b307..4caf38f 100644 --- a/docs/CODE_QUALITY.md +++ b/docs/CODE_QUALITY.md @@ -75,7 +75,7 @@ available functionality: 15. Each PR is automatically tested with [Address Sanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [Undefined Behavior Sanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). - See [Testing systemd using sanitizers](TESTING_WITH_SANITIZERS) + See [Testing systemd using sanitizers](/TESTING_WITH_SANITIZERS) for more information. 16. Fossies provides [source code misspelling reports](https://fossies.org/features.html#codespell). diff --git a/docs/CODING_STYLE.md b/docs/CODING_STYLE.md index 6d6e549..b4e88c9 100644 --- a/docs/CODING_STYLE.md +++ b/docs/CODING_STYLE.md @@ -297,7 +297,7 @@ SPDX-License-Identifier: LGPL-2.1-or-later t.bar = "bazz"; ``` -- To implement an endless loop, use `for (;;)` rather than `while (1)`. The +- To implement an endless loop, use `for (;;)` rather than `while (1)`. The latter is a bit ugly anyway, since you probably really meant `while (true)`. To avoid the discussion what the right always-true expression for an infinite while loop is, our recommendation is to simply write it without any diff --git a/docs/CONTAINER_INTERFACE.md b/docs/CONTAINER_INTERFACE.md index 7fa8558..460cc67 100644 --- a/docs/CONTAINER_INTERFACE.md +++ b/docs/CONTAINER_INTERFACE.md @@ -7,8 +7,7 @@ SPDX-License-Identifier: LGPL-2.1-or-later # The Container Interface -Also consult [Writing Virtual Machine or Container -Managers](https://www.freedesktop.org/wiki/Software/systemd/writing-vm-managers). +Also consult [Writing Virtual Machine or Container Managers](/WRITING_VM_AND_CONTAINER_MANAGERS). 
systemd has a number of interfaces for interacting with container managers, when systemd is used inside of an OS container. If you work on a container @@ -121,7 +120,7 @@ manager, please consider supporting the following interfaces. variable's name you may only specify ptys, and not other types of ttys. Also you need to specify the pty itself, a symlink will not suffice. This is implemented in - [systemd-getty-generator(8)](https://www.freedesktop.org/software/systemd/man/systemd-getty-generator.html). + [systemd-getty-generator(8)](https://www.freedesktop.org/software/systemd/man/latest/systemd-getty-generator.html). Note that this variable should not include the pty that `/dev/console` maps to if it maps to one (see below). Example: if the container receives `container_ttys=pts/7 pts/8 pts/14` it will spawn three additional login @@ -131,7 +130,7 @@ manager, please consider supporting the following interfaces. running the container manager, if this is considered desirable, please parse the host's `/etc/os-release` and set a `$container_host_=` environment variable for the ID fields described by the [os-release - interface](https://www.freedesktop.org/software/systemd/man/os-release.html), eg: + interface](https://www.freedesktop.org/software/systemd/man/latest/os-release.html), eg: `$container_host_id=debian` `$container_host_build_id=2020-06-15` `$container_host_variant_id=server` diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index f599972..c247102 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -11,9 +11,13 @@ We welcome contributions from everyone. However, please follow the following gui ## Filing Issues -* We use [GitHub Issues](https://github.com/systemd/systemd/issues) **exclusively** for tracking **bugs** and **feature** **requests** (RFEs) of systemd. 
If you are looking for help, please try the forums of your distribution first, or [systemd-devel mailing list](https://lists.freedesktop.org/mailman/listinfo/systemd-devel) for general questions about systemd. -* We only track bugs in the **two** **most** **recently** **released** (non-rc) **versions** of systemd in the GitHub Issue tracker. If you are using an older version of systemd, please contact your distribution's bug tracker instead (see below). See [GitHub Release Page](https://github.com/systemd/systemd/releases) for the list of most recent releases. -* When filing a feature request issue (RFE), please always check first if the newest upstream version of systemd already implements the feature, and whether there's already an issue filed for your feature by someone else. +* We use [GitHub Issues](https://github.com/systemd/systemd/issues) **exclusively** for tracking **bugs** and **feature** **requests** (RFEs) of systemd. + If you are looking for help, please try the forums of your distribution first, or [systemd-devel mailing list](https://lists.freedesktop.org/mailman/listinfo/systemd-devel) for general questions about systemd. +* We only track bugs in the **two** **most** **recently** **released** (non-rc) **versions** of systemd in the GitHub Issue tracker. + If you are using an older version of systemd, please contact your distribution's bug tracker instead (see below). + See [GitHub Release Page](https://github.com/systemd/systemd/releases) for the list of most recent releases. +* When filing a feature request issue (RFE), please always check first if the newest upstream version of systemd already implements the feature, + and whether there's already an issue filed for your feature by someone else. * When filing an issue, specify the **systemd** **version** you are experiencing the issue with. Also, indicate which **distribution** you are using. * Please include an explanation how to reproduce the issue you are pointing out. 
@@ -29,21 +33,23 @@ For older versions that are still supported by your distribution please use resp ## Security vulnerability reports -See [reporting of security vulnerabilities](SECURITY). +See [reporting of security vulnerabilities](/SECURITY). ## Posting Pull Requests * Make sure to post PRs only relative to a recent tip of the `main` branch. -* Follow our [Coding Style](CODING_STYLE) when contributing code. This is a requirement for all code we merge. -* Please make sure to test your change before submitting the PR. See the [Hacking guide](HACKING) for details on how to do this. +* Follow our [Coding Style](/CODING_STYLE) when contributing code. This is a requirement for all code we merge. +* Please make sure to test your change before submitting the PR. See the [Hacking guide](/HACKING) for details on how to do this. * Make sure to run the test suite locally, before posting your PR. We use a CI system, meaning we don't even look at your PR if the build and tests don't pass. * If you need to update the code in an existing PR, force-push into the same branch, overriding old commits with new versions. -* After you have pushed a new version, add a comment explaining the latest changes. If you are a member of the systemd project on GitHub, remove the `reviewed/needs-rework`/`ci-fails/needs-rework`/`needs-rebase` labels. -* If you are copying existing code from another source (eg: a compat header), please make sure the license is compatible with `LGPL-2.1-or-later`. If the license is not `LGPL-2.1-or-later`, please add a note to [`LICENSES/README.md`](https://github.com/systemd/systemd/blob/main/LICENSES/README.md). -* If the pull request stalls without review, post a ping in a comment after some time has passed. We are always short on reviewer time, and pull requests which haven't seen any recent activity can be easily forgotten. -* Github will automatically add the `please-review` label when a pull request is opened or updated. 
If you need -more information after a review, you can comment `/please-review` on the pull request to have Github add the -`please-review` label to the pull request. +* After you have pushed a new version, add a comment explaining the latest changes. + If you are a member of the systemd project on GitHub, remove the `reviewed/needs-rework`/`ci-fails/needs-rework`/`needs-rebase` labels. +* If you are copying existing code from another source (eg: a compat header), please make sure the license is compatible with `LGPL-2.1-or-later`. + If the license is not `LGPL-2.1-or-later`, please add a note to [`LICENSES/README.md`](https://github.com/systemd/systemd/blob/main/LICENSES/README.md). +* If the pull request stalls without review, post a ping in a comment after some time has passed. + We are always short on reviewer time, and pull requests which haven't seen any recent activity can be easily forgotten. +* Github will automatically add the `please-review` label when a pull request is opened or updated. + If you need more information after a review, you can comment `/please-review` on the pull request to have Github add the `please-review` label to the pull request. ## Reviewing Pull Requests @@ -69,9 +75,9 @@ Thank you very much for your contributions! # Backward Compatibility And External Dependencies -We strive to keep backward compatibility where possible and reasonable. The following are general guidelines, not hard -rules, and case-by-case exceptions might be applied at the discretion of the maintainers. The current set of build-time -and runtime dependencies are documented in the [README](https://github.com/systemd/systemd/blob/main/README). +We strive to keep backward compatibility where possible and reasonable. +The following are general guidelines, not hard rules, and case-by-case exceptions might be applied at the discretion of the maintainers. 
+The current set of build-time and runtime dependencies is documented in the [README](https://github.com/systemd/systemd/blob/main/README). ## New features @@ -102,9 +108,9 @@ for existing functionality. ## `libsystemd.so` -`libsystemd.so` is a shared public library, so breaking ABI/API compatibility would create lot of work for everyone, and is not allowed. Instead, always add a new interface instead of modifying -the signature of an existing function. It is fine to mark an interface as deprecated to gently nudge users toward a newer one, -but support for the old one must be maintained. +`libsystemd.so` is a shared public library, so breaking ABI/API compatibility would create a lot of work for everyone, and is not allowed. +Instead, always add a new interface instead of modifying the signature of an existing function. +It is fine to mark an interface as deprecated to gently nudge users toward a newer one, but support for the old one must be maintained. Symbol versioning and the compiler's deprecated attribute should be used when managing the lifetime of a public interface. ## `libudev.so` diff --git a/docs/CONTROL_GROUP_INTERFACE.md b/docs/CONTROL_GROUP_INTERFACE.md index 11dc6a3..c82a2c3 100644 --- a/docs/CONTROL_GROUP_INTERFACE.md +++ b/docs/CONTROL_GROUP_INTERFACE.md @@ -9,14 +9,24 @@ SPDX-License-Identifier: LGPL-2.1-or-later > _aka "I want to make use of kernel cgroups, how do I do this in the new world order?"_ -Starting with version 205 systemd provides a number of interfaces that may be used to create and manage labelled groups of processes for the purpose of monitoring and controlling them and their resource usage. This is built on top of the Linux kernel Control Groups ("cgroups") facility. Previously, the kernel's cgroups API was exposed directly as shared application API, following the rules of the [Pax Control Groups](http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/) document.
However, the kernel cgroup interface has been reworked into an API that requires that each individual cgroup is managed by a single writer only. With this change the main cgroup tree becomes private property of that userspace component and is no longer a shared resource. On systemd systems PID 1 takes this role and hence needs to provide APIs for clients to take benefit of the control groups functionality of the kernel. Note that services running on systemd systems may manage their own subtrees of the cgroups tree, as long as they explicitly turn on delegation mode for them (see below). +Starting with version 205 systemd provides a number of interfaces that may be used to create and manage labelled groups of processes for the purpose of monitoring and controlling them and their resource usage. +This is built on top of the Linux kernel Control Groups ("cgroups") facility. + +Previously, the kernel's cgroups API was exposed directly as shared application API, following the rules of the [Pax Control Groups](/PAX_CONTROL_GROUPS) document. +However, the kernel cgroup interface has been reworked into an API that requires that each individual cgroup is managed by a single writer only. + +With this change the main cgroup tree becomes private property of that userspace component and is no longer a shared resource. + +On systemd systems PID 1 takes this role and hence needs to provide APIs for clients to take benefit of the control groups functionality of the kernel. + +Note that services running on systemd systems may manage their own subtrees of the cgroups tree, as long as they explicitly turn on delegation mode for them (see below). That means explicitly, that: 1. The root control group may only be written to by systemd (PID 1). Services that create and manipulate control groups in the top level cgroup are in direct conflict with the kernel's requirement that each control group should have a single-writer only. 2. 
Services must set Delegate=yes for the units they intend to manage subcgroups of. If they create and manipulate cgroups outside of units that have Delegate=yes set, they violate the access contract for control groups. -For a more high-level background story, please have a look at this [Linux Foundation News Story](http://www.linuxfoundation.org/news-media/blogs/browse/2013/08/all-about-linux-kernel-cgroup%E2%80%99s-redesign). +For a more high-level background story, please have a look at this [Linux Foundation News Story](https://www.linuxfoundation.jp/blog/2013/08/all-about-the-linux-kernel-cgroups-redesign/). ### Why this all again? @@ -46,7 +56,7 @@ On systemd systems use the systemd APIs as described below. At this time we are ### What's the timeframe of this? Do I need to care now? -In the short-term future writing directly to the control group tree from applications should still be OK, as long as the [Pax Control Groups](http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/) document is followed. In the medium-term future it will still be supported to alter/read individual attributes of cgroups directly, but no longer to create/delete cgroups without using the systemd API. In the longer-term future altering/reading attributes will also be unavailable to userspace applications, unless done via systemd's APIs (either D-Bus based IPC APIs or shared library APIs for _passive_ operations). +In the short-term future writing directly to the control group tree from applications should still be OK, as long as the [Pax Control Groups](/PAX_CONTROL_GROUPS) document is followed. In the medium-term future it will still be supported to alter/read individual attributes of cgroups directly, but no longer to create/delete cgroups without using the systemd API. 
In the longer-term future altering/reading attributes will also be unavailable to userspace applications, unless done via systemd's APIs (either D-Bus based IPC APIs or shared library APIs for _passive_ operations). It is recommended to use the new systemd APIs described below in any case. Note that the kernel cgroup interface is currently being reworked (available when the "sane_behaviour" kernel option is used). This will change the cgroupfs interface. By using systemd's APIs this change is abstracted away and invisible to applications. @@ -193,7 +203,7 @@ Most relevant APIs are exposed via D-Bus, however some _passive_ interfaces are ### Creating and Starting -To create and start a transient (scope, service or slice) unit in the cgroup tree use the `StartTransientUnit()` method on the `Manager` object exposed by systemd's PID 1 on the bus, see the [Bus API Documentation](http://www.freedesktop.org/wiki/Software/systemd/dbus/) for details. This call takes four arguments. The first argument is the full unit name you want this unit to be known under. This unit name is the handle to the unit, and is shown in the "systemctl" output and elsewhere. This name must be unique during runtime of the unit. You should generate a descriptive name for this that is useful for the administrator to make sense of it. The second parameter is the mode, and should usually be `replace` or `fail`. The third parameter contains an array of initial properties to set for the unit. It is an array of pairs of property names as string and values as variant. Note that this is an array and not a dictionary! This is that way in order to match the properties array of the `SetProperties()` call (see below). The fourth parameter is currently not used and should be passed as empty array. This call will first create the transient unit and then immediately queue a start job for it. This call returns an object path to a `Job` object for the start job of this unit. 
+To create and start a transient (scope, service or slice) unit in the cgroup tree use the `StartTransientUnit()` method on the `Manager` object exposed by systemd's PID 1 on the bus, see the [Bus API Documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html) for details. This call takes four arguments. The first argument is the full unit name you want this unit to be known under. This unit name is the handle to the unit, and is shown in the "systemctl" output and elsewhere. This name must be unique during runtime of the unit. You should generate a descriptive name for this that is useful for the administrator to make sense of it. The second parameter is the mode, and should usually be `replace` or `fail`. The third parameter contains an array of initial properties to set for the unit. It is an array of pairs of property names as string and values as variant. Note that this is an array and not a dictionary! This is that way in order to match the properties array of the `SetProperties()` call (see below). The fourth parameter is currently not used and should be passed as empty array. This call will first create the transient unit and then immediately queue a start job for it. This call returns an object path to a `Job` object for the start job of this unit. ### Properties @@ -209,7 +219,7 @@ To acquire a list of currently running units, use the `ListUnits()` call on the ### VM and Container Managers -Use these APIs to register any kind of process workload with systemd to be placed in a resource controlled cgroup. Note however that for containers and virtual machines it is better to use the [`machined`](http://www.freedesktop.org/wiki/Software/systemd/machined/) interfaces since they provide integration with "ps" and similar tools beyond what mere cgroup registration provides. Also see [Writing VM and Container Managers](http://www.freedesktop.org/wiki/Software/systemd/writing-vm-managers/) for details. 
+Use these APIs to register any kind of process workload with systemd to be placed in a resource controlled cgroup. Note however that for containers and virtual machines it is better to use the [`machined`](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.machine1.html) interfaces since they provide integration with "ps" and similar tools beyond what mere cgroup registration provides. Also see [Writing VM and Container Managers](/WRITING_VM_AND_CONTAINER_MANAGERS) for details. ### Reading Accounting Information diff --git a/docs/CONVERTING_TO_HOMED.md b/docs/CONVERTING_TO_HOMED.md index 5416a22..a31ff5a 100644 --- a/docs/CONVERTING_TO_HOMED.md +++ b/docs/CONVERTING_TO_HOMED.md @@ -8,35 +8,36 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Converting Existing Users to systemd-homed managed Users Traditionally on most Linux distributions, regular (human) users are managed -via entries in `/etc/passwd`, `/etc/shadow`, `/etc/group` and -`/etc/gshadow`. With the advent of +via entries in `/etc/passwd`, `/etc/shadow`, `/etc/group` and `/etc/gshadow`. +With the advent of [`systemd-homed`](https://www.freedesktop.org/software/systemd/man/systemd-homed.service.html) it might be desirable to convert an existing, traditional user account to a -`systemd-homed` managed one. Below is a brief guide how to do that. +`systemd-homed` managed one. +Below is a brief guide how to do that. Before continuing, please read up on these basic concepts: -* [Home Directories](HOME_DIRECTORY) -* [JSON User Records](USER_RECORD) -* [JSON Group Records](GROUP_RECORD) -* [User/Group Record Lookup API via Varlink](USER_GROUP_API) +* [Home Directories](/HOME_DIRECTORY) +* [JSON User Records](/USER_RECORD) +* [JSON Group Records](/GROUP_RECORD) +* [User/Group Record Lookup API via Varlink](/USER_GROUP_API) ## Caveat -This is a manual process, and possibly a bit fragile. Hence, do this at your -own risk, read up beforehand, and make a backup first. 
You know what's at -stake: your own home directory, i.e. all your personal data. +This is a manual process, and possibly a bit fragile. +Hence, do this at your own risk, read up beforehand, and make a backup first. +You know what's at stake: your own home directory, i.e. all your personal data. ## Step-By-Step Here's the step-by-step guide: 0. Preparations: make sure you run a distribution that has `systemd-homed` - enabled and properly set up, including the necessary PAM and NSS - configuration updates. Make sure you have enough disk space in `/home/` for - a (temporary) second copy of your home directory. Make sure to backup your - home directory. Make sure to log out of your user account fully. Then log in - as root on the console. + enabled and properly set up, including the necessary PAM and NSS configuration updates. + Make sure you have enough disk space in `/home/` for a (temporary) second copy of your home directory. + Make sure to backup your home directory. + Make sure to log out of your user account fully. + Then log in as root on the console. 1. Rename your existing home directory to something safe. Let's say your user ID is `foobar`. Then do: @@ -45,92 +46,86 @@ Here's the step-by-step guide: mv /home/foobar /home/foobar.saved ``` -2. Have a look at your existing user record, as stored in `/etc/passwd` and - related files. We want to use the same data for the new record, hence it's good - looking at the old data. Use commands such as: +2. Have a look at your existing user record, as stored in `/etc/passwd` and related files. + We want to use the same data for the new record, hence it's good looking at the old data. + + Use commands such as: ``` getent passwd foobar getent shadow foobar ``` - This will tell you the `/etc/passwd` and `/etc/shadow` entries for your - user. For details about the fields, see the respective man pages + This will tell you the `/etc/passwd` and `/etc/shadow` entries for your user. 
+ For details about the fields, see the respective man pages [passwd(5)](https://man7.org/linux/man-pages/man5/passwd.5.html) and [shadow(5)](https://man7.org/linux/man-pages/man5/shadow.5.html). - The fourth field in the `getent passwd foobar` output tells you the GID of - your user's main group. Depending on your distribution it's a group private - to the user, or a group shared by most local, regular users. Let's say the - GID reported is 1000, let's then query its details: + The fourth field in the `getent passwd foobar` output tells you the GID of your user's main group. + Depending on your distribution it's a group private to the user, or a group shared by most local, regular users. + Let's say the GID reported is 1000, let's then query its details: ``` getent group 1000 ``` - This will tell you the name of that group. If the name is the same as your - user name your distribution apparently provided you with a private group for - your user. If it doesn't match (and is something like `users`) it apparently - didn't. Note that `systemd-homed` will always manage a private group for - each user under the same name, hence if your distribution is one of the - latter kind, then there's a (minor) mismatch in structure when converting. + This will tell you the name of that group. + If the name is the same as your user name your distribution apparently provided you with a private group for your user. + If it doesn't match (and is something like `users`) it apparently didn't. + Note that `systemd-homed` will always manage a private group for each user under the same name, + hence if your distribution is one of the latter kind, then there's a (minor) mismatch in structure when converting. - Save the information reported by these three commands somewhere, for later - reference. + Save the information reported by these three commands somewhere, for later reference. 3. Now edit your `/etc/passwd` file and remove your existing record - (i.e. 
delete a single line, the one of your user's account, leaving all - other lines unmodified). Similar for `/etc/shadow`, `/etc/group` (in case - you have a private group for your user) and `/etc/gshadow`. Most - distributions provide you with a tool for that, that adds safe + (i.e. delete a single line, the one of your user's account, leaving all other lines unmodified). + Similar for `/etc/shadow`, `/etc/group` (in case you have a private group for your user) and `/etc/gshadow`. + Most distributions provide you with a tool for that, that adds safe synchronization for these changes: `vipw`, `vipw -s`, `vigr` and `vigr -s`. 4. At this point the old user account vanished, while the home directory still - exists safely under the `/home/foobar.saved` name. Let's now create a new - account with `systemd-homed`, using the same username and UID as before: + exists safely under the `/home/foobar.saved` name. + Let's now create a new account with `systemd-homed`, using the same username and UID as before: - ``` - homectl create foobar --uid=$UID --real-name=$GECOS - ``` + ```sh + homectl create foobar --uid=$UID --real-name=$GECOS + ``` In this command line, replace `$UID` by the UID you previously used, - i.e. the third field of the `getent passwd foobar` output above. Similar, - replace `$GECOS` by the GECOS field of your old account, i.e the fifth field - of the old output. If your distribution traditionally does not assign a - private group to regular user groups, then consider adding `--member-of=` - with the group name to get a modicum of compatibility with the status quo - ante: this way your new user account will still not have the old primary + i.e. the third field of the `getent passwd foobar` output above. + Similarly, replace `$GECOS` by the GECOS field of your old account, i.e. the fifth field of the old output.
+ If your distribution traditionally does not assign a private group to regular user groups, + then consider adding `--member-of=` with the group name to get a modicum of compatibility with the status quo ante: + this way your new user account will still not have the old primary group as new primary group, but will have it as auxiliary group. Consider reading through the [homectl(1)](https://www.freedesktop.org/software/systemd/man/homectl.html) - manual page at this point, maybe there are a couple of other settings you - want to set for your new account. In particular, look at `--storage=` and - `--disk-size=`, in order to change how your home directory shall be stored + manual page at this point, maybe there are a couple of other settings you want to set for your new account. + In particular, look at `--storage=` and `--disk-size=`, in order to change how your home directory shall be stored (the default `luks` storage is recommended). -5. Your new user account exists now, but it has an empty home directory. Let's - now migrate your old home directory into it. For that let's mount the new - home directory temporarily and copy the data in. +1. Your new user account exists now, but it has an empty home directory. + Let's now migrate your old home directory into it. + For that let's mount the new home directory temporarily and copy the data in. ``` homectl with foobar -- rsync -aHANUXv --remove-source-files /home/foobar.saved/ . ``` This mounts the home directory of the user, and then runs the specified - `rsync` command which copies the contents of the old home directory into the - new. The new home directory is the working directory of the invoked `rsync` - process. We are invoking this command as root, hence the `rsync` runs as - root too. When the `rsync` command completes the home directory is - automatically unmounted again. Since we used `--remove-source-files` all files - copied are removed from the old home directory as the copy progresses. 
After - the command completes the old home directory should be empty. Let's remove - it hence: + `rsync` command which copies the contents of the old home directory into the new. + The new home directory is the working directory of the invoked `rsync` process. + We are invoking this command as root, hence the `rsync` runs as root too. + When the `rsync` command completes the home directory is automatically unmounted again. + Since we used `--remove-source-files` all files copied are removed from the old home directory as the copy progresses. + After the command completes the old home directory should be empty. + Let's remove it hence: ``` rmdir /home/foobar.saved ``` -And that's it, we are done already. You can log out now and should be able to -log in under your user account as usual, but now with `systemd-homed` managing -your home directory. +And that's it, we are done already. +You can log out now and should be able to log in under your user account as usual, +but now with `systemd-homed` managing your home directory. diff --git a/docs/COREDUMP.md b/docs/COREDUMP.md index c64579e..d235479 100644 --- a/docs/COREDUMP.md +++ b/docs/COREDUMP.md @@ -10,36 +10,35 @@ SPDX-License-Identifier: LGPL-2.1-or-later ## Support in the Service Manager (PID 1) The systemd service manager natively provides coredump handling functionality, -as implemented by the Linux kernel. Specifically, PID 1 provides the following -functionality: +as implemented by the Linux kernel. +Specifically, PID 1 provides the following functionality: 1. During very early boot it will raise the [`LIMIT_CORE`](https://man7.org/linux/man-pages/man2/getrlimit.2.html) - resource limit for itself to infinity (and thus implicitly also all its - children). This removes any limits on the size of generated coredumps, for - all invoked processes, from earliest boot on. (The Linux kernel sets the - limit to 0 by default.) + resource limit for itself to infinity (and thus implicitly also all its children). 
+ This removes any limits on the size of generated coredumps, + for all invoked processes, from earliest boot on. + (The Linux kernel sets the limit to 0 by default.) 2. At the same time it will turn off coredump handling in the kernel by writing `|/bin/false` into `/proc/sys/kernel/core_pattern` (also known as the "`kernel.core_pattern` sysctl"; see [core(5)](https://man7.org/linux/man-pages/man5/core.5.html) for - details). This means that coredumps are not actually processed. (The Linux - kernel sets the pattern to `core` by default, so that coredumps are written + details). + This means that coredumps are not actually processed. + (The Linux kernel sets the pattern to `core` by default, so that coredumps are written to the current working directory of the crashing process.) Net effect: after PID1 has started and performed this setup coredumps are disabled, but by means of the the `kernel.core_pattern` sysctl rather than by -size limit. This is generally preferable, since the pattern can be updated -trivially at the right time to enable coredumping once the system is ready, -taking comprehensive effect on all userspace. (Or to say this differently: -disabling coredumps via the size limit is problematic, since it cannot easily -be undone without iterating through all already running processes once the -system is ready for coredump handling.) +size limit. +This is generally preferable, since the pattern can be updated trivially at the right time to enable coredumping once the system is ready, taking comprehensive effect on all userspace. +(Or to say this differently: disabling coredumps via the size limit is problematic, since it cannot easily +be undone without iterating through all already running processes once the system is ready for coredump handling.) Processing of core dumps may be enabled at the appropriate time by updating the -`kernel.core_pattern` sysctl. Only coredumps that happen later will be -processed. +`kernel.core_pattern` sysctl. 
+Only coredumps that happen later will be processed. During the final shutdown phase the `kernel.core_pattern` sysctl is updated again to `|/bin/false`, disabling coredump support again, should it have been @@ -68,44 +67,46 @@ means the coredump handler runs for a very short time only, and the potentially specified service unit, and thus can take benefit of regular service resource management and sandboxing. -The `systemd-coredump` handler will extract a backtrace and [ELF packaging -metadata](https://systemd.io/ELF_PACKAGE_METADATA) from any coredumps it -receives and log both. The information about coredumps stored in the journal -can be enumerated and queried with the +The `systemd-coredump` handler will extract a backtrace and +[ELF packaging metadata](/ELF_PACKAGE_METADATA) from any coredumps it +receives and log both. +The information about coredumps stored in the journal can be enumerated and queried with the [`coredumpctl`](https://www.freedesktop.org/software/systemd/man/coredumpctl.html) tool, for example for directly invoking a debugger such as `gdb` on a collected coredump. -The handler writes coredump files to `/var/lib/systemd/coredump/`. Old files -are cleaned up periodically by +The handler writes coredump files to `/var/lib/systemd/coredump/`. +Old files are cleaned up periodically by [`systemd-tmpfiles(8)`](https://www.freedesktop.org/software/systemd/man/systemd-tmpfiles.html). ## User Experience With the above, any coredumps generated on the system are by default collected and turned into logged events — except during very early boot and late -shutdown. Individual services, processes or users can opt-out of coredump -collection, by setting `LIMIT_CORE` to 0 (or alternatively invoke -[`PR_SET_DUMPABLE`](https://man7.org/linux/man-pages/man2/prctl.2.html)). The -resource limit can be set freely by daemons/processes/users to arbitrary -values, which the coredump handler will respect. 
The `coredumpctl` tool may be -used to further analyze/debug coredumps. +shutdown. +Individual services, processes or users can opt-out of coredump collection, +by setting `LIMIT_CORE` to 0 (or alternatively invoke +[`PR_SET_DUMPABLE`](https://man7.org/linux/man-pages/man2/prctl.2.html)). +The resource limit can be set freely by daemons/processes/users to arbitrary +values, which the coredump handler will respect. +The `coredumpctl` tool may be used to further analyze/debug coredumps. ## Alternative Coredump Handlers While we recommend usage of the `systemd-coredump` handler, it's fully -supported to use alternative coredump handlers instead. A similar -implementation pattern is recommended. Specifically: +supported to use alternative coredump handlers instead. +A similar implementation pattern is recommended. +Specifically: -1. Use a `sysctl.d/` drop-in to register your handler with the kernel. Make - sure to include the `%c` specifier in the pattern (which reflects the - crashing process' `RLIMIT_CORE`) and act on it: limit the stored coredump - file to the specified limit. +1. Use a `sysctl.d/` drop-in to register your handler with the kernel. + Make sure to include the `%c` specifier in the pattern (which reflects the + crashing process' `RLIMIT_CORE`) and act on it: + limit the stored coredump file to the specified limit. -2. Do not do heavy processing directly in the coredump handler. Instead, - quickly pass off the kernel's coredump file descriptor to an - auxiliary service running as service under the service manager, so that it - can be done under supervision, sandboxing and resource management. +2. Do not do heavy processing directly in the coredump handler. + Instead, quickly pass off the kernel's coredump file descriptor to an + auxiliary service running as service under the service manager, + so that it can be done under supervision, sandboxing and resource management. Note that at any given time only a single handler can be enabled, i.e. 
the `kernel.core_pattern` sysctl cannot reference multiple executables. @@ -113,7 +114,8 @@ Note that at any given time only a single handler can be enabled, i.e. the ## Packaging It might make sense to split `systemd-coredump` into a separate distribution -package. If doing so, make sure that `/usr/lib/sysctl.d/50-coredump.conf` and +package. +If doing so, make sure that `/usr/lib/sysctl.d/50-coredump.conf` and the associated service and socket units are also added to the split off package. Note that in a scenario where `systemd-coredump` is split out and not @@ -125,8 +127,8 @@ to legacy style handling (see below). The default policy of the kernel to write coredumps into the current working directory of the crashing process is considered highly problematic by many, -including by the systemd maintainers. Nonetheless, if users locally want to -return to this behaviour, two changes must be made (followed by a reboot): +including by the systemd maintainers. +Nonetheless, if users locally want to return to this behaviour, two changes must be made (followed by a reboot): ```console $ mkdir -p /etc/sysctl.d diff --git a/docs/CREDENTIALS.md b/docs/CREDENTIALS.md index ed30eac..efa948b 100644 --- a/docs/CREDENTIALS.md +++ b/docs/CREDENTIALS.md @@ -59,7 +59,7 @@ purpose. Specifically, the following features are provided: 8. Credentials are an effective way to pass parameters into services that run with `RootImage=` or `RootDirectory=` and thus cannot read these resources directly from the host directory tree. - Specifically, [Portable Services](PORTABLE_SERVICES) may be + Specifically, [Portable Services](/PORTABLE_SERVICES) may be parameterized this way securely and robustly. 9. Credentials can be binary and relatively large (though currently an overall @@ -288,7 +288,7 @@ services where they are ultimately consumed. invokes. 
[`systemd-nspawn(1)`](https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html#Credentials)'s `--set-credential=` and `--load-credential=` switches implement this, in order to pass arbitrary credentials from host to container payload. Also see - the [Container Interface](CONTAINER_INTERFACE) documentation. + the [Container Interface](/CONTAINER_INTERFACE) documentation. 2. Quite similar, VMs can be passed credentials via SMBIOS OEM strings (example qemu command line switch `-smbios diff --git a/docs/DAEMON_SOCKET_ACTIVATION.md b/docs/DAEMON_SOCKET_ACTIVATION.md index 1a027a3..70a3299 100644 --- a/docs/DAEMON_SOCKET_ACTIVATION.md +++ b/docs/DAEMON_SOCKET_ACTIVATION.md @@ -7,7 +7,8 @@ SPDX-License-Identifier: LGPL-2.1-or-later ## nginx -nginx includes an undocumented, internal socket-passing mechanism based on the `NGINX` environmental variable. It uses this to perform reloads without having to close and reopen its sockets, but it's also useful for socket activation. +nginx includes an undocumented, internal socket-passing mechanism based on the `NGINX` environmental variable. +It uses this to perform reloads without having to close and reopen its sockets, but it's also useful for socket activation. **/etc/nginx/my-nginx.conf** @@ -31,27 +32,29 @@ ExecStart=/usr/sbin/nginx -c/etc/nginx/my-nginx.conf PrivateNetwork=true ``` - **/etc/systemd/system/my-nginx.socket** ``` +[Unit] +After=network.target +Requires=network.target + [Socket] ListenStream=80 ListenStream=0.0.0.0:80 BindIPv6Only=ipv6-only -After=network.target -Requires=network.target [Install] WantedBy=sockets.target ``` - ## PHP-FPM -Like nginx, PHP-FPM includes a socket-passing mechanism an environmental variable. In PHP-FPM's case, it's `FPM_SOCKETS`. +Like nginx, PHP-FPM includes a socket-passing mechanism based on an environmental variable. +In PHP-FPM's case, it's `FPM_SOCKETS`. -This configuration is possible with any web server that supports FastCGI (like Apache, Lighttpd, or nginx).
The web server does not need to know anything special about the socket; use a normal PHP-FPM configuration. +This configuration is possible with any web server that supports FastCGI (like Apache, Lighttpd, or nginx). +The web server does not need to know anything special about the socket; use a normal PHP-FPM configuration. Paths are based on a Fedora 19 system. @@ -74,7 +77,6 @@ pm.max_children = 10 slowlog = syslog ``` - **/etc/systemd/system/my-php-fpm-pool.service** ``` @@ -86,7 +88,6 @@ ExecStart=/usr/sbin/php-fpm --fpm-config=/etc/php-fpm.d/my-php-fpm-pool.conf KillMode=process ``` - **/etc/systemd/system/my-php-fpm-pool.socket** ``` @@ -97,7 +98,6 @@ ListenStream=/var/run/my-php-fpm-pool.socket WantedBy=sockets.target ``` - ### Second, the setup commands ```sh @@ -106,15 +106,14 @@ sudo systemctl start my-php-fpm-pool.socket sudo systemctl enable my-php-fpm-pool.socket ``` - After accessing the web server, the service should be running. ```sh sudo systemctl status my-php-fpm-pool.service ``` - -It's possible to shut down the service and re-activate it using the web browser, too. It's necessary to stop and start the socket to reset some shutdown PHP-FPM does that otherwise breaks reactivation. +It's possible to shut down the service and re-activate it using the web browser, too. +It's necessary to stop and start the socket to reset some shutdown PHP-FPM does that otherwise breaks reactivation. ```sh sudo systemctl stop my-php-fpm-pool.socket my-php-fpm-pool.service diff --git a/docs/DEBUGGING.md b/docs/DEBUGGING.md index dc1c874..3e89a5d 100644 --- a/docs/DEBUGGING.md +++ b/docs/DEBUGGING.md @@ -9,36 +9,48 @@ SPDX-License-Identifier: LGPL-2.1-or-later If your machine gets stuck during boot, first check if the hang happens before or after control passes to systemd. -Try to boot without `rhgb` and `quiet` on the kernel command line. If you see some messages like these: +Try to boot without `rhgb` and `quiet` on the kernel command line. 
+If you see some messages like these: * Welcome to Fedora _VERSION_ (_codename_)!" * Starting _name_... * \[ OK \] Started _name_. -then systemd is running. (See an actual [screenshot](f17boot.png).) +then systemd is running. +(See an actual [screenshot](../assets/f17boot.png).) -Debugging always gets easier if you can get a shell. If you do not get a login prompt, try switching to a different virtual terminal using CTRL+ALT+F\_\_. Problems with a display server startup may manifest themselves as a missing login on tty1, but other VTs working. +Debugging always gets easier if you can get a shell. +If you do not get a login prompt, try switching to a different virtual terminal using CTRL+ALT+F\_\_. +Problems with a display server startup may manifest themselves as a missing login on tty1, but other VTs working. -If the boot stops without presenting you with a login on any virtual console, let it retry for _up to 5 minutes_ before declaring it definitely stuck. There is a chance that a service that has trouble starting will be killed after this timeout and the boot will continue normally. Another possibility is that a device for an important mountpoint will fail to appear and you will be presented with _emergency mode_. +If the boot stops without presenting you with a login on any virtual console, +let it retry for _up to 5 minutes_ before declaring it definitely stuck. +There is a chance that a service that has trouble starting will be killed after this timeout and the boot will continue normally. +Another possibility is that a device for an important mountpoint will fail to appear and you will be presented with _emergency mode_. ## If You Get No Shell -If you get neither a normal login nor the emergency mode shell, you will need to do additional steps to get debugging information out of the machine. +If you get neither a normal login nor the emergency mode shell, +you will need to do additional steps to get debugging information out of the machine. 
* Try CTRL+ALT+DEL to reboot. - * If it does not reboot, mention it in your bugreport. Meanwhile force the reboot with [SysRq](http://fedoraproject.org/wiki/QA/Sysrq) or hard reset. + * If it does not reboot, mention it in your bugreport. Meanwhile force the reboot with + [SysRq](http://fedoraproject.org/wiki/QA/Sysrq) + or hard reset. * When booting the next time, you will have to add some kernel command line arguments depending on which of the debugging strategies you choose from the following options. ### Debug Logging to a Serial Console -If you have a hardware serial console available or if you are debugging in a virtual machine (e.g. using virt-manager you can switch your view to a serial console in the menu View -> Text Consoles or connect from the terminal using `virsh console MACHINE`), you can ask systemd to log lots of useful debugging information to it by booting with: +If you have a hardware serial console available or if you are debugging in a virtual machine +(e.g. using virt-manager you can switch your view to a serial console in the menu View -> Text Consoles or connect from the terminal using `virsh console MACHINE`), +you can ask systemd to log lots of useful debugging information to it by booting with: ```sh systemd.log_level=debug systemd.log_target=console console=ttyS0,38400 console=tty1 ``` - -The above is useful if pid 1 is failing, but if a later but critical boot service is broken (such as networking), you can configure journald to forward to the console by using: +The above is useful if pid 1 is failing, but if a later but critical boot service is broken (such as networking), +you can configure journald to forward to the console by using: ```sh systemd.journald.forward_to_console=1 console=ttyS0,38400 console=tty1 @@ -48,23 +60,31 @@ console= can be specified multiple times, systemd will output to all of them. 
### Booting into Rescue or Emergency Targets -To boot directly into rescue target add `systemd.unit=rescue.target` or just `1` to the kernel command line. This target is useful if the problem occurs somewhere after the basic system is brought up, during the starting of "normal" services. If this is the case, you should be able to disable the bad service from here. If the rescue target will not boot either, the more minimal emergency target might. +To boot directly into rescue target add `systemd.unit=rescue.target` or just `1` to the kernel command line. +This target is useful if the problem occurs somewhere after the basic system is brought up, during the starting of "normal" services. +If this is the case, you should be able to disable the bad service from here. +If the rescue target will not boot either, the more minimal emergency target might. -To boot directly into emergency shell add `systemd.unit=emergency.target` or `emergency` to the kernel command line. Note that in the emergency shell you will have to remount the root filesystem read-write by yourself before editing any files: +To boot directly into emergency shell add `systemd.unit=emergency.target` or `emergency` to the kernel command line. +Note that in the emergency shell you will have to remount the root filesystem read-write by yourself before editing any files: ```sh mount -o remount,rw / ``` -Common issues that can be resolved in the emergency shell are bad lines in **/etc/fstab**. After fixing **/etc/fstab**, run `systemctl daemon-reload` to let systemd refresh its view of it. +Common issues that can be resolved in the emergency shell are bad lines in `/etc/fstab`. +After fixing **/etc/fstab**, run `systemctl daemon-reload` to let systemd refresh its view of it. -If not even the emergency target works, you can boot directly into a shell with `init=/bin/sh`. This may be necessary in case systemd itself or some libraries it depends on are damaged by filesystem corruption. 
You may need to reinstall working versions of the affected packages. +If not even the emergency target works, you can boot directly into a shell with `init=/bin/sh`. +This may be necessary in case systemd itself or some libraries it depends on are damaged by filesystem corruption. +You may need to reinstall working versions of the affected packages. If `init=/bin/sh` does not work, you must boot from another medium. ### Early Debug Shell -You can enable shell access to be available very early in the startup process to fall back on and diagnose systemd related boot up issues with various systemctl commands. Enable it using: +You can enable shell access to be available very early in the startup process to fall back on and diagnose systemd related boot up issues with various systemctl commands. +Enable it using: ```sh systemctl enable debug-shell.service @@ -78,31 +98,46 @@ systemd.debug-shell=1 on the kernel command line. -**Tip**: If you find yourself in a situation where you cannot use systemctl to communicate with a running systemd (e.g. when setting this up from a different booted system), you can avoid communication with the manager by specifying `--root=`: +**Tip**: If you find yourself in a situation where you cannot use systemctl to communicate with a running systemd +(e.g. when setting this up from a different booted system), +you can avoid communication with the manager by specifying `--root=`: ```sh systemctl --root=/ enable debug-shell.service ``` -Once enabled, the next time you boot you will be able to switch to tty9 using CTRL+ALT+F9 and have a root shell there available from an early point in the booting process. You can use the shell for checking the status of services, reading logs, looking for stuck jobs with `systemctl list-jobs`, etc. +Once enabled, the next time you boot you will be able to switch to tty9 using CTRL+ALT+F9 and have a root shell there available from an early point in the booting process. 
+You can use the shell for checking the status of services, reading logs, looking for stuck jobs with `systemctl list-jobs`, etc. -**Warning:** Use this shell only for debugging! Do not forget to disable systemd-debug-shell.service after you've finished debugging your boot problems. Leaving the root shell always available would be a security risk. +**Warning:** Use this shell only for debugging! +Do not forget to disable `debug-shell.service` after you've finished debugging your boot problems. +Leaving the root shell always available would be a security risk. -It is also possible to alias `kbrequest.target` to `debug-shell.service` to start the debug shell on demand. This has the same security implications, but avoids running the shell always. +It is also possible to alias `kbrequest.target` to `debug-shell.service` to start the debug shell on demand. +This has the same security implications, but avoids running the shell always. ### verify prerequisites -A (at least partly) populated `/dev` is required. Depending on your setup (e.g. on embedded systems), check that the Linux kernel config options `CONFIG_DEVTMPFS` and `CONFIG_DEVTMPFS_MOUNT` are set. Also support for cgroups and fanotify is recommended for a flawless operation, so check that the Linux kernel config options `CONFIG_CGROUPS` and `CONFIG_FANOTIFY` are set. The message "Failed to get D-Bus connection: No connection to service manager." during various `systemctl` operations is an indicator that these are missing. +A (at least partly) populated `/dev` is required. +Depending on your setup (e.g. on embedded systems), +check that the Linux kernel config options `CONFIG_DEVTMPFS` and `CONFIG_DEVTMPFS_MOUNT` are set. +Also support for cgroups and fanotify is recommended for a flawless operation, so check that the Linux kernel config options `CONFIG_CGROUPS` and `CONFIG_FANOTIFY` are set. +The message "Failed to get D-Bus connection: No connection to service manager."
+during various `systemctl` operations is an indicator that these are missing. ## If You Can Get a Shell -When you have systemd running to the extent that it can provide you with a shell, please use it to extract useful information for debugging. Boot with these parameters on the kernel command line: +When you have systemd running to the extent that it can provide you with a shell, +please use it to extract useful information for debugging. +Boot with these parameters on the kernel command line: ```sh systemd.log_level=debug systemd.log_target=kmsg log_buf_len=1M printk.devkmsg=on ``` -in order to increase the verbosity of systemd, to let systemd write its logs to the kernel log buffer, to increase the size of the kernel log buffer, and to prevent the kernel from discarding messages. After reaching the shell, look at the log: +in order to increase the verbosity of systemd, to let systemd write its logs to the kernel log buffer, +to increase the size of the kernel log buffer, and to prevent the kernel from discarding messages. +After reaching the shell, look at the log: ```sh journalctl -b @@ -118,12 +153,14 @@ systemctl list-jobs The jobs that are listed as "running" are the ones that must complete before the "waiting" ones will be allowed to start executing. - # Diagnosing Shutdown Problems -Just like with boot problems, when you encounter a hang during shutting down, make sure you wait _at least 5 minutes_ to distinguish a permanent hang from a broken service that's just timing out. Then it's worth testing whether the system reacts to CTRL+ALT+DEL in any way. +Just like with boot problems, when you encounter a hang during shutting down, +make sure you wait _at least 5 minutes_ to distinguish a permanent hang from a broken service that's just timing out. +Then it's worth testing whether the system reacts to CTRL+ALT+DEL in any way. 
-If shutdown (whether it be to reboot or power-off) of your system gets stuck, first test if the kernel itself is able to reboot or power-off the machine forcedly using one of these commands: +If shutdown (whether it be to reboot or power-off) of your system gets stuck, +first test if the kernel itself is able to reboot or power-off the machine forcedly using one of these commands: ```sh reboot -f @@ -142,7 +179,7 @@ If normal reboot or poweroff work, but take a suspiciously long time, then systemd.log_level=debug systemd.log_target=kmsg log_buf_len=1M printk.devkmsg=on enforcing=0 ``` -* save the following script as **/usr/lib/systemd/system-shutdown/debug.sh** and make it executable: +* save the following script as `/usr/lib/systemd/system-shutdown/debug.sh` and make it executable: ```sh #!/bin/sh @@ -153,17 +190,17 @@ mount -o remount,ro / * reboot - -Look for timeouts logged in the resulting file **shutdown-log.txt** and/or attach it to a bugreport. +Look for timeouts logged in the resulting file `shutdown-log.txt` and/or attach it to a bugreport. ## Shutdown Never Finishes -If normal reboot or poweroff never finish even after waiting a few minutes, the above method to create the shutdown log will not help and the log must be obtained using other methods. Two options that are useful for debugging boot problems can be used also for shutdown problems: +If normal reboot or poweroff never finish even after waiting a few minutes, +the above method to create the shutdown log will not help and the log must be obtained using other methods. +Two options that are useful for debugging boot problems can be used also for shutdown problems: * use a serial console * use a debug shell - not only is it available from early boot, it also stays active until late shutdown. 
- # Status and Logs of Services When the start of a service fails, systemctl will give you a generic error message: @@ -173,38 +210,52 @@ When the start of a service fails, systemctl will give you a generic error messa Job failed. See system journal and 'systemctl status' for details. ``` -The service may have printed its own error message, but you do not see it, because services run by systemd are not related to your login session and their outputs are not connected to your terminal. That does not mean the output is lost though. By default the stdout, stderr of services are directed to the systemd _journal_ and the logs that services produce via `syslog(3)` go there too. systemd also stores the exit code of failed services. Let's check: +The service may have printed its own error message, but you do not see it, +because services run by systemd are not related to your login session and their outputs are not connected to your terminal. +That does not mean the output is lost though. +By default the stdout, +stderr of services are directed to the systemd _journal_ and the logs that services produce via `syslog(3)` go there too. +systemd also stores the exit code of failed services. +Let's check: ```sh # systemctl status foo.service foo.service - mmm service - Loaded: loaded (/etc/systemd/system/foo.service; static) - Active: failed (Result: exit-code) since Fri, 11 May 2012 20:26:23 +0200; 4s ago - Process: 1329 ExecStart=/usr/local/bin/foo (code=exited, status=1/FAILURE) - CGroup: name=systemd:/system/foo.service +Loaded: loaded (/etc/systemd/system/foo.service; static) +Active: failed (Result: exit-code) since Fri, 11 May 2012 20:26:23 +0200; 4s ago +Process: 1329 ExecStart=/usr/local/bin/foo (code=exited, status=1/FAILURE) +CGroup: name=systemd:/system/foo.service May 11 20:26:23 scratch foo[1329]: Failed to parse config ``` - -In this example the service ran as a process with PID 1329 and exited with error code 1. 
If you run systemctl status as root or as a user from the `adm` group, you will get a few lines from the journal that the service wrote. In the example the service produced just one error message. +In this example the service ran as a process with PID 1329 and exited with error code 1. +If you run systemctl status as root or as a user from the `adm` group, +you will get a few lines from the journal that the service wrote. +In the example the service produced just one error message. To list the journal, use the `journalctl` command. -If you have a syslog service (such as rsyslog) running, the journal will also forward the messages to it, so you'll find them in **/var/log/messages** (depending on rsyslog's configuration). - +If you have a syslog service (such as rsyslog) running, the journal will also forward the messages to it, +so you'll find them in `/var/log/messages` (depending on rsyslog's configuration). # Reporting systemd Bugs -Be prepared to include some information (logs) about your system as well. These should be complete (no snippets please), not in an archive, uncompressed. +Be prepared to include some information (logs) about your system as well. +These should be complete (no snippets please), not in an archive, uncompressed. -Please report bugs to your distribution's bug tracker first. If you are sure that you are encountering an upstream bug, then first check [for existing bug reports](https://github.com/systemd/systemd/issues/), and if your issue is not listed [file a new bug](https://github.com/systemd/systemd/issues/new). +Please report bugs to your distribution's bug tracker first. +If you are sure that you are encountering an upstream bug, then first check +[for existing bug reports](https://github.com/systemd/systemd/issues/), +and if your issue is not listed +[file a new bug](https://github.com/systemd/systemd/issues/new). 
## Information to Attach to a Bug Report Whenever possible, the following should be mentioned and attached to your bug report: -* The exact kernel command-line used. Typically from the bootloader configuration file (e.g. **/boot/grub2/grub.cfg**) or from **/proc/cmdline** +* The exact kernel command-line used. +Typically from the bootloader configuration file (e.g. `/boot/grub2/grub.cfg`) or from `/proc/cmdline` * The journal (the output of `journalctl -b > journal.txt`) * ideally after booting with `systemd.log_level=debug systemd.log_target=kmsg log_buf_len=1M printk.devkmsg=on` * The output of a systemd dump: `systemd-analyze dump > systemd-dump.txt` diff --git a/docs/DESKTOP_ENVIRONMENTS.md b/docs/DESKTOP_ENVIRONMENTS.md index 0a0eff6..1c72969 100644 --- a/docs/DESKTOP_ENVIRONMENTS.md +++ b/docs/DESKTOP_ENVIRONMENTS.md @@ -59,21 +59,27 @@ desktop environments should adhere to the following conventions: - `app-KDE-org.kde.okular@12345.service` - `app-org.kde.amarok.service` - `app-org.gnome.Evince-12345.scope` + * Using `.service` units instead of `.scope` units, i.e. allowing systemd to start the process on behalf of the caller, instead of the caller starting the process and letting systemd know about it, is encouraged. + * The RANDOM should be a string of random characters to ensure that multiple instances of the application can be launched. + It can be omitted in the case of a non-transient application services which can ensure multiple instances are not spawned, such as a DBus activated application. + * If no application ID is available, the launcher should generate a reasonable name when possible (e.g. using `basename(argv[0])`). This name must not contain a `-` character. This has the following advantages: + * Using the `app--` prefix means that the unit defaults can be adjusted using desktop environment specific drop-in files. + * The application ID can be retrieved by stripping the prefix and postfix. 
This in turn should map to the corresponding `.desktop` file when available @@ -82,8 +88,8 @@ This could be `app---.slice`. TODO: Does it really make sense to insert the ``? In GNOME I am currently using a drop-in to configure `BindTo=graphical-session.target`, -`CollectMode=inactive-or-failed` and `TimeoutSec=5s`. I feel that such a -policy makes sense, but it may make much more sense to just define a +`CollectMode=inactive-or-failed` and `TimeoutSec=5s`. +I feel that such a policy makes sense, but it may make much more sense to just define a global default for all (graphical) applications. * Should application lifetime be bound to the session? @@ -95,19 +101,17 @@ global default for all (graphical) applications. To allow XDG autostart integration, systemd ships a cross-desktop generator to create appropriate units for the autostart directory (`systemd-xdg-autostart-generator`). -Desktop Environments can opt-in to using this by starting -`xdg-desktop-autostart.target`. The systemd generator correctly handles -`OnlyShowIn=` and `NotShowIn=`. It also handles the KDE and GNOME specific -`X-KDE-autostart-condition=` and `AutostartCondition=` by using desktop-environment-provided -binaries in an `ExecCondition=` line. - -However, this generator is somewhat limited in what it supports. For example, -all generated units will have `After=graphical-session.target` set on them, +Desktop Environments can opt-in to using this by starting `xdg-desktop-autostart.target`. +The systemd generator correctly handles `OnlyShowIn=` and `NotShowIn=`. +It also handles the KDE and GNOME specific `X-KDE-autostart-condition=` and `AutostartCondition=` by using desktop-environment-provided binaries in an `ExecCondition=` line. + +However, this generator is somewhat limited in what it supports. +For example, all generated units will have `After=graphical-session.target` set on them, and therefore may not be useful to start session services. 
Desktop files can be marked to be explicitly excluded from the generator using the line -`X-systemd-skip=true`. This should be set if an application provides its own -systemd service file for startup. +`X-systemd-skip=true`. +This should be set if an application provides its own systemd service file for startup. ## Startup and shutdown best practices diff --git a/docs/DISTRO_PORTING.md b/docs/DISTRO_PORTING.md index c95a829..cb23093 100644 --- a/docs/DISTRO_PORTING.md +++ b/docs/DISTRO_PORTING.md @@ -9,8 +9,7 @@ SPDX-License-Identifier: LGPL-2.1-or-later ## HOWTO -You need to make the follow changes to adapt systemd to your -distribution: +You need to make the following changes to adapt systemd to your distribution: 1. Find the right configure parameters for: @@ -27,23 +26,22 @@ distribution: 2. Try it out. Play around (as an ordinary user) with - `/usr/lib/systemd/systemd --test --system` for a test run - of systemd without booting. This will read the unit files and - print the initial transaction it would execute during boot-up. + `/usr/lib/systemd/systemd --test --system` for a test run of systemd without booting. + This will read the unit files and print the initial transaction it would execute during boot-up. This will also inform you about ordering loops and suchlike. ## Compilation options -The default configuration does not enable any optimization or hardening -options. This is suitable for development and testing, but not for end-user +The default configuration does not enable any optimization or hardening options. +This is suitable for development and testing, but not for end-user installations. For deployment, optimization (`-O2` or `-O3` compiler options), link time optimization (`-Db_lto=true` meson option), and hardening (e.g. `-D_FORTIFY_SOURCE=2`, `-fstack-protector-strong`, `-fstack-clash-protection`, `-fcf-protection`, `-pie` compiler options, and `-z relro`, `-z now`, -`--as-needed` linker options) are recommended.
The most appropriate set of -options depends on the architecture and distribution specifics so no default is +`--as-needed` linker options) are recommended. +The most appropriate set of options depends on the architecture and distribution specifics so no default is provided. ## NTP Pool @@ -56,8 +54,9 @@ and can be up to .5s off from servers that use stepped leap seconds. If you prefer to use leap second steps, please register your own vendor pool at ntp.org and make it the built-in default by -passing `-Dntp-servers=` to meson. Registering vendor -pools is [free](http://www.pool.ntp.org/en/vendors.html). +passing `-Dntp-servers=` to meson. +Registering vendor pools is +[free](http://www.pool.ntp.org/en/vendors.html). Use `-Dntp-servers=` to direct systemd-timesyncd to different fallback NTP servers. @@ -75,8 +74,8 @@ DNS servers. The default PAM config shipped by systemd is really bare bones. It does not include many modules your distro might want to enable -to provide a more seamless experience. For example, limits set in -`/etc/security/limits.conf` will not be read unless you load `pam_limits`. +to provide a more seamless experience. +For example, limits set in `/etc/security/limits.conf` will not be read unless you load `pam_limits`. Make sure you add modules your distro expects from user services. Pass `-Dpamconfdir=no` to meson to avoid installing this file and @@ -85,10 +84,9 @@ instead install your own. ## Contributing Upstream We generally no longer accept distribution-specific patches to -systemd upstream. If you have to make changes to systemd's source code -to make it work on your distribution, unless your code is generic -enough to be generally useful, we are unlikely to merge it. Please -always consider adopting the upstream defaults. If that is not -possible, please maintain the relevant patches downstream. +systemd upstream. 
+If you have to make changes to systemd's source code to make it work on your distribution, unless your code is generic enough to be generally useful, we are unlikely to merge it. +Please always consider adopting the upstream defaults. +If that is not possible, please maintain the relevant patches downstream. Thank you for understanding. diff --git a/docs/ELF_PACKAGE_METADATA.md b/docs/ELF_PACKAGE_METADATA.md index 6cb3f78..176f574 100644 --- a/docs/ELF_PACKAGE_METADATA.md +++ b/docs/ELF_PACKAGE_METADATA.md @@ -12,8 +12,8 @@ or parse ELF core files.* ## Motivation -ELF binaries get stamped with a unique, build-time generated hex string identifier called -`build-id`, [which gets embedded as an ELF note called `.note.gnu.build-id`](https://fedoraproject.org/wiki/Releases/FeatureBuildId). +ELF binaries get stamped with a unique, build-time generated hex string identifier called `build-id`, +[which gets embedded as an ELF note called `.note.gnu.build-id`](https://fedoraproject.org/wiki/Releases/FeatureBuildId). In most cases, this allows to associate a stripped binary with its debugging information. It is used, for example, to dynamically fetch DWARF symbols from a debuginfo server, or to query the local package manager and find out the package metadata or, again, the DWARF diff --git a/docs/FAQ.md b/docs/FAQ.md index 483645b..3b03726 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -7,7 +7,7 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Frequently Asked Questions -Also check out the [Tips & Tricks](../TIPS_AND_TRICKS)! +Also check out the [Tips & Tricks](/TIPS_AND_TRICKS)! **Q: How do I change the current runlevel?** @@ -70,7 +70,8 @@ A: Simply instantiate a new getty service for the port of your choice (internall # systemctl start serial-getty@ttyS2.service ``` -Note that gettys on the virtual console are started on demand. You can control how many you get via the NAutoVTs= setting in [logind.conf(7)](http://www.freedesktop.org/software/systemd/man/logind.html). 
Also see [this blog story](http://0pointer.de/blog/projects/serial-console.html). +Note that gettys on the virtual console are started on demand. You can control how many you get via the NAutoVTs= setting in [logind.conf(7)](http://www.freedesktop.org/software/systemd/man/systemd-logind.service). +Also see [this blog story](http://0pointer.de/blog/projects/serial-console.html). **Q: How to I figure out which service a process belongs to?** @@ -103,12 +104,12 @@ A: Use: **Q: Whenever my service tries to acquire RT scheduling for one of its threads this is refused with EPERM even though my service is running with full privileges. This works fine on my non-systemd system!** -A: By default, systemd places all systemd daemons in their own cgroup in the "cpu" hierarchy. Unfortunately, due to a kernel limitation, this has the effect of disallowing RT entirely for the service. See [My Service Can't Get Realtime!](../MY_SERVICE_CANT_GET_REATLIME) for a longer discussion and what to do about this. +A: By default, systemd places all systemd daemons in their own cgroup in the "cpu" hierarchy. Unfortunately, due to a kernel limitation, this has the effect of disallowing RT entirely for the service. See [My Service Can't Get Realtime!](/MY_SERVICE_CANT_GET_REATLIME) for a longer discussion and what to do about this. **Q: My service is ordered after `network.target` but at boot it is still called before the network is up. What's going on?** -A: That's a long story, and that's why we have a wiki page of its own about this: [Running Services After the Network is up](../NETWORK_ONLINE) +A: That's a long story, and that's why we have a wiki page of its own about this: [Running Services After the Network is up](/NETWORK_ONLINE) **Q: My systemd system always comes up with `/tmp` as a tiny `tmpfs`. 
How do I get rid of this?** -A: That's also a long story, please have a look on [API File Systems](../API_FILE_SYSTEMS) +A: That's also a long story, please have a look on [API File Systems](/API_FILE_SYSTEMS) diff --git a/docs/GROUP_RECORD.md b/docs/GROUP_RECORD.md index f463b0a..c055e49 100644 --- a/docs/GROUP_RECORD.md +++ b/docs/GROUP_RECORD.md @@ -8,23 +8,23 @@ SPDX-License-Identifier: LGPL-2.1-or-later # JSON Group Records Long story short: JSON Group Records are to `struct group` what -[JSON User Records](USER_RECORD) are to `struct passwd`. +[JSON User Records](/USER_RECORD) are to `struct passwd`. -Conceptually, much of what applies to JSON user records also applies to JSON -group records. They also consist of seven sections, with similar properties and +Conceptually, much of what applies to JSON user records also applies to JSON group records. +They also consist of seven sections, with similar properties and they carry some identical (or at least very similar) fields. ## Fields in the `regular` section -`groupName` → A string with the UNIX group name. Matches the `gr_name` field of -UNIX/glibc NSS `struct group`, or the shadow structure `struct sgrp`'s -`sg_namp` field. +`groupName` → A string with the UNIX group name. +Matches the `gr_name` field of UNIX/glibc NSS `struct group`, +or the shadow structure `struct sgrp`'s `sg_namp` field. -`realm` → The "realm" the group belongs to, conceptually identical to the same -field of user records. A string in DNS domain name syntax. +`realm` → The "realm" the group belongs to, conceptually identical to the same field of user records. +A string in DNS domain name syntax. -`description` → A descriptive string for the group. This is similar to the -`realName` field of user records, and accepts arbitrary strings, as long as +`description` → A descriptive string for the group. 
+This is similar to the `realName` field of user records, and accepts arbitrary strings, as long as they follow the same GECOS syntax requirements as `realName`. `disposition` → The disposition of the group, conceptually identical to the @@ -33,39 +33,36 @@ same field of user records. A string. `service` → A string, an identifier for the service managing this group record (this field is typically in reverse domain name syntax.) -`lastChangeUSec` → An unsigned 64-bit integer, a timestamp (in µs since the UNIX -epoch 1970) of the last time the group record has been modified. (Covers only -the `regular`, `perMachine` and `privileged` sections). +`lastChangeUSec` → An unsigned 64-bit integer, a timestamp +(in µs since the UNIX epoch 1970) of the last time the group record has been modified. +(Covers only the `regular`, `perMachine` and `privileged` sections). -`gid` → An unsigned integer in the range 0…4294967295: the numeric UNIX group -ID (GID) to use for the group. This corresponds to the `gr_gid` field of -`struct group`. +`gid` → An unsigned integer in the range 0…4294967295: the numeric UNIX group ID (GID) to use for the group. +This corresponds to the `gr_gid` field of `struct group`. -`members` → An array of strings, listing user names that are members of this -group. Note that JSON user records also contain a `memberOf` field, or in other +`members` → An array of strings, listing user names that are members of this group. +Note that JSON user records also contain a `memberOf` field, or in other words a group membership can either be denoted in the JSON user record or in -the JSON group record, or in both. The list of memberships should be determined -as the combination of both lists (plus optionally others). If a user is listed -as member of a group and doesn't exist it should be ignored. This field -corresponds to the `gr_mem` field of `struct group` and the `sg_mem` field of -`struct sgrp`. +the JSON group record, or in both. 
-`administrators` → Similarly, an array of strings, listing user names that -shall be considered "administrators" of this group. This field corresponds to -the `sg_adm` field of `struct sgrp`. +The list of memberships should be determined as the combination of both lists (plus optionally others). +If a user is listed as member of a group and doesn't exist it should be ignored. +This field corresponds to the `gr_mem` field of `struct group` and the `sg_mem` field of `struct sgrp`. + +`administrators` → Similarly, an array of strings, listing user names that shall be considered "administrators" of this group. +This field corresponds to the `sg_adm` field of `struct sgrp`. `privileged`/`perMachine`/`binding`/`status`/`signature`/`secret` → The -objects/arrays for the other six group record sections. These are organized the -same way as for the JSON user records, and have the same semantics. +objects/arrays for the other six group record sections. +These are organized the same way as for the JSON user records, and have the same semantics. ## Fields in the `privileged` section The following fields are defined: -`hashedPassword` → An array of strings with UNIX hashed passwords; see the -matching field for user records for details. This field corresponds to the -`sg_passwd` field of `struct sgrp` (and `gr_passwd` of `struct group` in a -way). +`hashedPassword` → An array of strings with UNIX hashed passwords; +see the matching field for user records for details. +This field corresponds to the `sg_passwd` field of `struct sgrp` (and `gr_passwd` of `struct group` in a way). ## Fields in the `perMachine` section diff --git a/docs/HACKING.md b/docs/HACKING.md index aea25db..45334d8 100644 --- a/docs/HACKING.md +++ b/docs/HACKING.md @@ -11,8 +11,8 @@ We welcome all contributions to systemd. If you notice a bug or a missing feature, please feel invited to fix it, and submit your work as a [GitHub Pull Request (PR)](https://github.com/systemd/systemd/pull/new). 
-Please make sure to follow our [Coding Style](CODING_STYLE) when submitting -patches. Also have a look at our [Contribution Guidelines](CONTRIBUTING). +Please make sure to follow our [Coding Style](/CODING_STYLE) when submitting +patches. Also have a look at our [Contribution Guidelines](/CONTRIBUTING). When adding new functionality, tests should be added. For shared functionality (in `src/basic/` and `src/shared/`) unit tests should be sufficient. The general @@ -23,7 +23,7 @@ test executable. For features at a higher level, tests in `src/test/` are very strongly recommended. If that is not possible, integration tests in `test/` are encouraged. -Please also have a look at our list of [code quality tools](CODE_QUALITY) we +Please also have a look at our list of [code quality tools](/CODE_QUALITY) we have setup for systemd, to ensure our codebase stays in good shape. Please always test your work before submitting a PR. For many of the components @@ -117,7 +117,7 @@ Some source files are generated during build. We use two templating engines: where jinja2 syntax is not interpreted. See the - [Jinja Template Designer Documentation](https://jinja2docs.readthedocs.io/en/stable/templates.html#synopsis) + [Jinja Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for details. Please note that files for both template engines use the `.in` extension. @@ -131,7 +131,7 @@ distribution and can be disabled by setting `-Dmode=release`. ## Sanitizers in mkosi -See [Testing systemd using sanitizers](TESTING_WITH_SANITIZERS) for more information +See [Testing systemd using sanitizers](/TESTING_WITH_SANITIZERS) for more information on how to build with sanitizers enabled in mkosi. ## Fuzzers @@ -194,7 +194,7 @@ done ``` If you find a bug that impacts the security of systemd, please follow the -guidance in [CONTRIBUTING.md](CONTRIBUTING) on how to report a security vulnerability. 
+guidance in [CONTRIBUTING.md](/CONTRIBUTING) on how to report a security vulnerability. For more details on building fuzzers and integrating with OSS-Fuzz, visit: diff --git a/docs/HOME_DIRECTORY.md b/docs/HOME_DIRECTORY.md index f1b7faf..2efabae 100644 --- a/docs/HOME_DIRECTORY.md +++ b/docs/HOME_DIRECTORY.md @@ -8,24 +8,24 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Home Directories [`systemd-homed.service(8)`](https://www.freedesktop.org/software/systemd/man/systemd-homed.service.html) -manages home directories of regular ("human") users. Each directory it manages -encapsulates both the data store and the user record of the user, so that it -comprehensively describes the user account, and is thus naturally portable -between systems without any further, external metadata. This document describes -the format used by these home directories, in the context of the storage +manages home directories of regular ("human") users. +Each directory it manages encapsulates both the data store and the user record of the user, +so that it comprehensively describes the user account, and is thus naturally portable +between systems without any further, external metadata. +This document describes the format used by these home directories, in the context of the storage mechanism used. ## General Structure Inside of the home directory a file `~/.identity` contains the JSON formatted -user record of the user. It follows the format defined in -[`JSON User Records`](USER_RECORD). It is recommended to bring the -record into 'normalized' form (i.e. all objects should contain their fields -sorted alphabetically by their key) before storing it there, though this is not -required nor enforced. Since the user record is cryptographically signed, the -user cannot make modifications to the file on their own (at least not without -corrupting it, or knowing the private key used for signing the record). 
Note -that user records are stored here without their `binding`, `status` and +user record of the user. +It follows the format defined in [`JSON User Records`](/USER_RECORD). +It is recommended to bring the record into 'normalized' form (i.e. all objects should contain their fields +sorted alphabetically by their key) before storing it there, +though this is not required nor enforced. +Since the user record is cryptographically signed, the user cannot make modifications to the file on their own +(at least not without corrupting it, or knowing the private key used for signing the record). +Note that user records are stored here without their `binding`, `status` and `secret` sections, i.e. only with the sections included in the signature plus the signature section itself. @@ -39,127 +39,119 @@ command line) the home directory requires no special setup besides including the user record in the `~/.identity` file. It is recommended to name home directories managed this way by -`systemd-homed.service` by the user name, suffixed with `.homedir` (example: -`lennart.homedir` for a user `lennart`) but this is not enforced. When the user -is logged in, the directory is generally mounted to `/home/$USER` (in our -example: `/home/lennart`), thus dropping the suffix while the home directory is -active. `systemd-homed` will automatically discover home directories named this -way in `/home/*.homedir` and synthesize NSS user records for them as they show -up. +`systemd-homed.service` by the user name, suffixed with `.homedir` +(example: `lennart.homedir` for a user `lennart`) but this is not enforced. +When the user is logged in, the directory is generally mounted to `/home/$USER` +(in our example: `/home/lennart`), thus dropping the suffix while the home directory is active. +`systemd-homed` will automatically discover home directories named this +way in `/home/*.homedir` and synthesize NSS user records for them as they show up.
## Storage Mechanism: `fscrypt` Directories This storage mechanism is mostly identical to the plain directory storage -mechanism, except that the home directory is encrypted using `fscrypt`. (Use -`--storage=fscrypt` on the `homectl` command line.) Key management is -implemented via extended attributes on the directory itself: for each password -an extended attribute `trusted.fscrypt_slot0`, `trusted.fscrypt_slot1`, -`trusted.fscrypt_slot2`, … is maintained. Its value contains a colon-separated -pair of Base64 encoded data fields. The first field contains a salt value, the -second field the encrypted volume key. The latter is encrypted using AES256 in -counter mode, using a key derived from the password via PBKDF2-HMAC-SHA512, -together with the salt value. The construction is similar to what LUKS does for -`dm-crypt` encrypted volumes. Note that extended attributes are not encrypted -by `fscrypt` and hence are suitable for carrying the key slots. Moreover, by -using extended attributes, the slots are directly attached to the directory and +mechanism, except that the home directory is encrypted using `fscrypt`. +(Use `--storage=fscrypt` on the `homectl` command line.) +Key management is implemented via extended attributes on the directory itself: +for each password an extended attribute `trusted.fscrypt_slot0`, `trusted.fscrypt_slot1`, +`trusted.fscrypt_slot2`, … is maintained. +Its value contains a colon-separated pair of Base64 encoded data fields. +The first field contains a salt value, the second field the encrypted volume key. +The latter is encrypted using AES256 in counter mode, using a key derived from the password via PBKDF2-HMAC-SHA512, +together with the salt value. +The construction is similar to what LUKS does for `dm-crypt` encrypted volumes. +Note that extended attributes are not encrypted by `fscrypt` and hence are suitable for carrying the key slots.
+Moreover, by using extended attributes, the slots are directly attached to the directory and an independent sidecar key database is not required. ## Storage Mechanism: `cifs` Home Directories In this storage mechanism, the home directory is mounted from a CIFS server and -service at login, configured inside the user record. (Use `--storage=cifs` on -the `homectl` command line.) The local password of the user is used to log into -the CIFS service. The directory share needs to contain the user record in -`~/.identity` as well. Note that this means that the user record needs to be -registered locally before it can be mounted for the first time, since CIFS -domain and server information needs to be known *before* the mount. Note that -for all other storage mechanisms it is entirely sufficient if the directories +service at login, configured inside the user record. +(Use `--storage=cifs` on the `homectl` command line.) +The local password of the user is used to log into the CIFS service. +The directory share needs to contain the user record in `~/.identity` as well. +Note that this means that the user record needs to be registered locally before it can be mounted for the first time, +since CIFS domain and server information needs to be known *before* the mount. +Note that for all other storage mechanisms it is entirely sufficient if the directories or storage artifacts are placed at the right locations — all information to activate them can be derived automatically from their mere availability. ## Storage Mechanism: `luks` Home Directories This is the most advanced and most secure storage mechanism and consists of a -Linux file system inside a LUKS2 volume inside a loopback file (or on removable -media). (Use `--storage=luks` on the `homectl` command line.) Specifically: - -* The image contains a GPT partition table. For now it should only contain a - single partition, and that partition must have the type UUID - `773f91ef-66d4-49b5-bd83-d683bf40ad16`. 
Its partition label must be the - user name. - -* This partition must contain a LUKS2 volume, whose label must be the user - name. The LUKS2 volume must contain a LUKS2 token field of type - `systemd-homed`. The JSON data of this token must have a `record` field, - containing a string with base64-encoded data. This data is the JSON user - record, in the same serialization as in `~/.identity`, though encrypted. The - JSON data of this token must also have an `iv` field, which contains a - base64-encoded binary initialization vector for the encryption. The - encryption used is the same as the LUKS2 volume itself uses, unlocked by the +Linux file system inside a LUKS2 volume inside a loopback file (or on removable media). +(Use `--storage=luks` on the `homectl` command line.) Specifically: + +* The image contains a GPT partition table. + For now it should only contain a single partition, + and that partition must have the type UUID + `773f91ef-66d4-49b5-bd83-d683bf40ad16`. + Its partition label must be the user name. + +* This partition must contain a LUKS2 volume, whose label must be the user name. + The LUKS2 volume must contain a LUKS2 token field of type `systemd-homed`. + The JSON data of this token must have a `record` field, containing a string with base64-encoded data. + This data is the JSON user record, in the same serialization as in `~/.identity`, though encrypted. + The JSON data of this token must also have an `iv` field, which contains a + base64-encoded binary initialization vector for the encryption. + The encryption used is the same as the LUKS2 volume itself uses, unlocked by the same volume key, but based on its own IV. -* Inside of this LUKS2 volume must be a Linux file system, one of `ext4`, - `btrfs` and `xfs`. The file system label must be the user name. +* Inside of this LUKS2 volume must be a Linux file system, one of `ext4`, `btrfs` and `xfs`. + The file system label must be the user name. 
-* This file system should contain a single directory named after the user. This - directory will become the home directory of the user when activated. It - contains a second copy of the user record in the `~/.identity` file, like in - the other storage mechanisms. +* This file system should contain a single directory named after the user. + This directory will become the home directory of the user when activated. + It contains a second copy of the user record in the `~/.identity` file, like in the other storage mechanisms. The image file should reside in a directory `/home/` on the system, -named after the user, suffixed with `.home`. When activated, the container home -directory is mounted to the same path, though with the `.home` suffix dropped — -unless a different mount point is defined in the user record. (e.g.: the -loopback file `/home/waldo.home` is mounted to `/home/waldo` while activated.) +named after the user, suffixed with `.home`. +When activated, the container home directory is mounted to the same path, +though with the `.home` suffix dropped — unless a different mount point is defined in the user record. +(e.g.: the loopback file `/home/waldo.home` is mounted to `/home/waldo` while activated.) When the image is stored on removable media (such as a USB stick), the image -file can be directly `dd`'ed onto it; the format is unchanged. The GPT envelope -should ensure the image is properly recognizable as a home directory both when -used in a loopback file and on a removable USB stick. (Note that when mounting -a home directory from a USB stick, it too defaults to a directory in `/home/`, +file can be directly `dd`'ed onto it; the format is unchanged. +The GPT envelope should ensure the image is properly recognizable as a home directory both when +used in a loopback file and on a removable USB stick. +(Note that when mounting a home directory from a USB stick, it too defaults to a directory in `/home/`, named after the username, with no further suffix.) 
-Rationale for the GPT partition table envelope: this way the image is nicely -discoverable and recognizable already by partition managers as a home -directory. Moreover, when copied onto a USB stick the GPT envelope makes sure -the stick is properly recognizable as a portable home directory -medium. (Moreover, it allows embedding additional partitions later on, for -example on a multi-purpose USB stick that contains both a home -directory and a generic storage volume.) +Rationale for the GPT partition table envelope: +this way the image is nicely discoverable and recognizable already by partition managers as a home directory. +Moreover, when copied onto a USB stick the GPT envelope makes sure +the stick is properly recognizable as a portable home directory medium. +(Moreover, it allows embedding additional partitions later on, for +example on a multi-purpose USB stick that contains both a home directory and a generic storage volume.) Rationale for including the encrypted user record in the LUKS2 header: Linux kernel file system implementations are generally not robust towards maliciously formatted file systems; there's a good chance that file system -images can be used as attack vectors, exploiting the kernel. Thus it is -necessary to validate the home directory image *before* mounting it and -establishing a minimal level of trust. Since the user record data is -cryptographically signed and user records not signed with a recognized private -key are not accepted, a minimal level of trust between the system and the home -directory image is established. +images can be used as attack vectors, exploiting the kernel. +Thus it is necessary to validate the home directory image *before* mounting it and establishing a minimal level of trust. +Since the user record data is cryptographically signed and user records not signed with a recognized private +key are not accepted, a minimal level of trust between the system and the home directory image is established.
Rationale for storing the home directory one level below to root directory of -the contained file system: this way special directories such as `lost+found/` -do not show up in the user's home directory. +the contained file system: +this way special directories such as `lost+found/` do not show up in the user's home directory. ## Algorithm Regardless of the storage mechanism used, an activated home directory -necessarily involves a mount point to be established. In case of the -directory-based storage mechanisms (`directory`, `subvolume` and `fscrypt`) -this is a bind mount. In case of `cifs` this is a CIFS network mount, and in -case of the LUKS2 backend a regular block device mount of the file system -contained in the LUKS2 image. By requiring a mount for all cases (even for -those that already are a directory), a clear logic is defined to distinguish -active and inactive home directories, so that the directories become -inaccessible under their regular path the instant they are -deactivated. Moreover, the `nosuid`, `nodev` and `noexec` flags configured in -the user record are applied when the bind mount is established. +necessarily involves a mount point to be established. +In case of the directory-based storage mechanisms (`directory`, `subvolume` and `fscrypt`) this is a bind mount. +In case of `cifs` this is a CIFS network mount, and in case of the LUKS2 backend a regular block device mount of the file system +contained in the LUKS2 image. +By requiring a mount for all cases (even for those that already are a directory), +a clear logic is defined to distinguish active and inactive home directories, +so that the directories become inaccessible under their regular path the instant they are deactivated. +Moreover, the `nosuid`, `nodev` and `noexec` flags configured in the user record are applied when the bind mount is established. 
During activation, the user records retained on the host, the user record stored in the LUKS2 header (in case of the LUKS2 storage mechanism) and the -user record stored inside the home directory in `~/.identity` are -compared. Activation is only permitted if they match the same user and are -signed by a recognized key. When the three instances differ in `lastChangeUSec` -field, the newest record wins, and is propagated to the other two locations. +user record stored inside the home directory in `~/.identity` are compared. +Activation is only permitted if they match the same user and are signed by a recognized key. +When the three instances differ in `lastChangeUSec` field, the newest record wins, and is propagated to the other two locations. During activation, the file system checker (`fsck`) appropriate for the selected file system is automatically invoked, ensuring the file system is in a diff --git a/docs/INCOMPATIBILITIES.md b/docs/INCOMPATIBILITIES.md index 75b60b6..332f1ef 100644 --- a/docs/INCOMPATIBILITIES.md +++ b/docs/INCOMPATIBILITIES.md @@ -7,13 +7,17 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Compatibility with SysV -systemd provides a fair degree of compatibility with the behavior exposed by the SysV init system as implemented by many distributions. Compatibility is provided both for the user experience and the SysV scripting APIs. However, there are some areas where compatibility is limited due to technical reasons or design decisions of systemd and the distributions. All of the following applies to SysV init scripts handled by systemd, however a number of them matter only on specific distributions. Many of the incompatibilities are specific to distribution-specific extensions of LSB/SysV init. +systemd provides a fair degree of compatibility with the behavior exposed by the SysV init system as implemented by many distributions. +Compatibility is provided both for the user experience and the SysV scripting APIs. 
+However, there are some areas where compatibility is limited due to technical reasons or design decisions of systemd and the distributions. +All of the following applies to SysV init scripts handled by systemd, however a number of them matter only on specific distributions. +Many of the incompatibilities are specific to distribution-specific extensions of LSB/SysV init. * If your distribution removes SysV init scripts in favor of systemd unit files typing "/etc/init.d/foobar start" to start a service will not work, since the script will not be available. Use the more correct "/sbin/service foobar start" instead, and your command will be forwarded to systemd. Note that invoking the init script directly has always been suboptimal since too much of the caller's execution context (environment block, umask, resource limits, audit trails, ...) ended up being inherited by the service, and invocation via "/sbin/service" used to clean this up at least partially. Invocation via /sbin/service works on both SysV and systemd systems. Also, LSB only standardizes invocation via "/sbin/service" anyway. (Note that some distributions ship both systemd unit files and SysV scripts for the services. For these invoking the init scripts will work as expected and the request be forwarded to systemd in any case.) * LSB header dependency information matters. The SysV implementations on many distributions did not use the dependency information encoded in LSB init script headers, or used them only in very limited ways. Due to that they are often incorrect or incomplete. systemd however fully interprets these headers and follows them closely at runtime (and not at installation time like some implementations). * Timeouts apply to all init script operations in systemd. While on SysV systems a hanging init script could freeze the system on systemd all init script operations are subject to a timeout of 5min. 
* Services are executed in completely clean execution contexts, no context of the invoking user session is inherited. Not even $HOME or similar are set. Init scripts depending on these will not work correctly. -* Services cannot read from stdin, as this will be connected to /dev/null. That means interactive init scripts are not supported (i.e. Debian's X-Interactive in the LSB header is not supported either.) Thankfully most distributions do not support interaction in init scripts anyway. If you need interaction to ask disk or SSL passphrases please consider using the minimal password querying framework systemd supports. ([details](PASSWORD_AGENTS), [manual page](http://0pointer.de/public/systemd-man/systemd-ask-password.html)) +* Services cannot read from stdin, as this will be connected to /dev/null. That means interactive init scripts are not supported (i.e. Debian's X-Interactive in the LSB header is not supported either.) Thankfully most distributions do not support interaction in init scripts anyway. If you need interaction to ask disk or SSL passphrases please consider using the minimal password querying framework systemd supports. ([details](/PASSWORD_AGENTS), [manual page](http://0pointer.de/public/systemd-man/systemd-ask-password.html)) * Additional verbs for init scripts are not supported. If your init script traditionally supported additional verbs for your init script simply move them to an auxiliary script. * Additional parameters to the standard verbs (i.e. to "start", "stop" and "status") are not supported. This was an extension of SysV that never was standardized officially, and is not supported in systemd. * Overriding the "restart" verb is not supported. This verb is always implemented by systemd itself, and consists of a "stop" followed by a "start". 
diff --git a/docs/INHIBITOR_LOCKS.md b/docs/INHIBITOR_LOCKS.md index 61efdc2..7dafc5e 100644 --- a/docs/INHIBITOR_LOCKS.md +++ b/docs/INHIBITOR_LOCKS.md @@ -10,12 +10,16 @@ SPDX-License-Identifier: LGPL-2.1-or-later systemd 183 and newer include a logic to inhibit system shutdowns and sleep states. This is implemented as part of [systemd-logind.daemon(8)](http://www.freedesktop.org/software/systemd/man/systemd-logind.service.html) There are a couple of different use cases for this: - A CD burning application wants to ensure that the system is not turned off or suspended while the burn process is in progress. + - A package manager wants to ensure that the system is not turned off while a package upgrade is in progress. + - An office suite wants to be notified before system suspend in order to save all data to disk, and delay the suspend logic until all data is written. + - A web browser wants to be notified before system hibernation in order to free its cache to minimize the amount of memory that needs to be virtualized. + - A screen lock tool wants to bring up the screen lock right before suspend, and delay the suspend until that's complete. -Applications which want to make use of the inhibition logic shall take an inhibitor lock via the [logind D-Bus API](http://www.freedesktop.org/wiki/Software/systemd/logind). +Applications which want to make use of the inhibition logic shall take an inhibitor lock via the [logind D-Bus API](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.login1.html). Seven distinct inhibitor lock types may be taken, or a combination of them: @@ -31,8 +35,12 @@ Seven distinct inhibitor lock types may be taken, or a combination of them: Two different modes of locks are supported: -1. _block_ inhibits operations entirely until the lock is released. If such a lock is taken the operation will fail (but still may be overridden if the user possesses the necessary privileges). -2. 
_delay_ inhibits operations only temporarily, either until the lock is released or up to a certain amount of time. The InhibitDelayMaxSec= setting in [logind.conf(5)](http://www.freedesktop.org/software/systemd/man/logind.conf.html) controls the timeout for this. This is intended to be used by applications which need a synchronous way to execute actions before system suspend but shall not be allowed to block suspend indefinitely. This mode is only available for _sleep_ and _shutdown_ locks. +1. _block_ inhibits operations entirely until the lock is released. +If such a lock is taken the operation will fail (but still may be overridden if the user possesses the necessary privileges). + +2. _delay_ inhibits operations only temporarily, either until the lock is released or up to a certain amount of time. +The InhibitDelayMaxSec= setting in [logind.conf(5)](http://www.freedesktop.org/software/systemd/man/logind.conf.html) controls the timeout for this. This is intended to be used by applications which need a synchronous way to execute actions before system suspend but shall not be allowed to block suspend indefinitely. +This mode is only available for _sleep_ and _shutdown_ locks. Inhibitor locks are taken via the Inhibit() D-Bus call on the logind Manager object: @@ -71,17 +79,38 @@ node /org/freedesktop/login1 { - _Why_ is a human-readable, descriptive string of why the lock is taken. Example: "Package Update in Progress" - _Mode_ is one of `block` or `delay`, see above. Example: "block" -Inhibit() returns a single value, a file descriptor that encapsulates the lock. As soon as the file descriptor is closed (and all its duplicates) the lock is automatically released. If the client dies while the lock is taken the kernel automatically closes the file descriptor so that the lock is automatically released. 
A delay lock taken this way should be released ASAP on reception of PrepareForShutdown(true) (see below), but of course only after execution of the actions the application wanted to delay the operation for in the first place. +Inhibit() returns a single value, a file descriptor that encapsulates the lock. +As soon as the file descriptor is closed (and all its duplicates) the lock is automatically released. +If the client dies while the lock is taken the kernel automatically closes the file descriptor so that the lock is automatically released. + +A delay lock taken this way should be released ASAP on reception of PrepareForShutdown(true) (see below), but of course only after execution of the actions the application wanted to delay the operation for in the first place. **ListInhibitors()** lists all currently active inhibitor locks. It returns an array of structs, each consisting of What, Who, Why, Mode as above, plus the PID and UID of the process that requested the lock. -The **PrepareForShutdown()** and **PrepareForSleep()** signals are emitted when a system suspend or shutdown has been requested and is about to be executed, as well as after the the suspend/shutdown was completed (or failed). The signals carry a boolean argument. If _True_ the shutdown/sleep has been requested, and the preparation phase for it begins, if _False_ the operation has finished completion (or failed). If _True_, this should be used as indication for applications to quickly execute the operations they wanted to execute before suspend/shutdown and then release any delay lock taken. If _False_ the suspend/shutdown operation is over, either successfully or unsuccessfully (of course, this signal will never be sent if a shutdown request was successful). 
The signal with _False_ is generally delivered only after the system comes back from suspend, the signal with _True_ possibly as well, for example when no delay lock was taken in the first place, and the system suspend hence executed without any delay. The signal with _False_ is usually the signal on which applications request a new delay lock in order to be synchronously notified about the next suspend/shutdown cycle. Note that watching PrepareForShutdown(true)[?](//secure.freedesktop.org/write/www/ikiwiki.cgi?do=create&from=Software%2Fsystemd%2Finhibit&page=Software%2Fsystemd%2Finhibit%2FPrepareForSleep)/PrepareForSleep(true) without taking a delay lock is racy and should not be done, as any code that an application might want to execute on this signal might not actually finish before the suspend/shutdown cycle is executed. _Again_: if you watch PrepareForSuspend(true), then you really should have taken a delay lock first. PrepareForShutdown(false) may be subscribed to by applications which want to be notified about system resume events. Note that this will only be sent out for suspend/resume cycles done via logind, i.e. generally only for high-level user-induced suspend cycles, and not automatic, low-level kernel induced ones which might exist on certain devices with more aggressive power management. +The **PrepareForShutdown()** and **PrepareForSleep()** signals are emitted when a system suspend or shutdown has been requested and is about to be executed, as well as after the suspend/shutdown was completed (or failed). + +The signals carry a boolean argument. +If _True_ the shutdown/sleep has been requested, and the preparation phase for it begins, if _False_ the operation has finished completion (or failed). + +If _True_, this should be used as indication for applications to quickly execute the operations they wanted to execute before suspend/shutdown and then release any delay lock taken. 
+If _False_ the suspend/shutdown operation is over, either successfully or unsuccessfully (of course, this signal will never be sent if a shutdown request was successful). + +The signal with _False_ is generally delivered only after the system comes back from suspend, the signal with _True_ possibly as well, for example when no delay lock was taken in the first place, and the system suspend hence executed without any delay. + +The signal with _False_ is usually the signal on which applications request a new delay lock in order to be synchronously notified about the next suspend/shutdown cycle. + +Note that watching PrepareForShutdown(true)/PrepareForSleep(true) without taking a delay lock is racy and should not be done, as any code that an application might want to execute on this signal might not actually finish before the suspend/shutdown cycle is executed. + +_Again_: if you watch PrepareForSleep(true), then you really should have taken a delay lock first. PrepareForSleep(false) may be subscribed to by applications which want to be notified about system resume events. + +Note that this will only be sent out for suspend/resume cycles done via logind, i.e. generally only for high-level user-induced suspend cycles, and not automatic, low-level kernel induced ones which might exist on certain devices with more aggressive power management. The **BlockInhibited** and **DelayInhibited** properties encode what types of locks are currently taken. These fields are a colon separated list of `shutdown`, `sleep`, `idle`, `handle-power-key`, `handle-suspend-key`, `handle-hibernate-key`, `handle-lid-switch`. The list is basically the union of the What fields of all currently active locks of the specific mode. 
**InhibitDelayMaxUSec** contains the delay timeout value as configured in [logind.conf(5)](http://www.freedesktop.org/software/systemd/man/logind.conf.html). -The **PreparingForShutdown** and **PreparingForSleep** boolean properties are true between the two PrepareForShutdown() resp PrepareForSleep() signals that are sent out. Note that these properties do not trigger PropertyChanged signals. +The **PreparingForShutdown** and **PreparingForSleep** boolean properties are true between the two PrepareForShutdown() resp PrepareForSleep() signals that are sent out. +Note that these properties do not trigger PropertyChanged signals. ## Taking Blocking Locks @@ -141,7 +170,17 @@ onPrepareForSleep(bool b) { ## Taking Key Handling Locks -By default logind will handle the power and sleep keys of the machine, as well as the lid switch in all states. This ensures that this basic system behavior is guaranteed to work in all circumstances, on text consoles as well as on all graphical environments. However, some DE might want to do their own handling of these keys, for example in order to show a pretty dialog box before executing the relevant operation, or to simply disable the action under certain conditions. For these cases the handle-power-key, handle-suspend-key, handle-hibernate-key and handle-lid-switch type inhibitor locks are available. When taken, these locks simply disable the low-level handling of the keys, they have no effect on system suspend/hibernate/poweroff executed with other mechanisms than the hardware keys (such as the user typing "systemctl suspend" in a shell). A DE intending to do its own handling of these keys should simply take the locks at login time, and release them on logout; alternatively it might make sense to take this lock only temporarily under certain circumstances (e.g. 
take the lid switch lock only when a second monitor is plugged in, in order to support the common setup where people close their laptops when they have the big screen connected). +By default logind will handle the power and sleep keys of the machine, as well as the lid switch in all states. + +This ensures that this basic system behavior is guaranteed to work in all circumstances, on text consoles as well as on all graphical environments. + +However, some DE might want to do their own handling of these keys, for example in order to show a pretty dialog box before executing the relevant operation, or to simply disable the action under certain conditions. +For these cases the handle-power-key, handle-suspend-key, handle-hibernate-key and handle-lid-switch type inhibitor locks are available. + +When taken, these locks simply disable the low-level handling of the keys, they have no effect on system suspend/hibernate/poweroff executed with other mechanisms than the hardware keys (such as the user typing "systemctl suspend" in a shell). + +A DE intending to do its own handling of these keys should simply take the locks at login time, and release them on logout; alternatively it might make sense to take this lock only temporarily under certain circumstances +(e.g. take the lid switch lock only when a second monitor is plugged in, in order to support the common setup where people close their laptops when they have the big screen connected). These locks need to be taken in the "block" mode, "delay" is not supported for them. @@ -149,12 +188,27 @@ If a DE wants to ensure the lock screen for the eventual resume is on the screen ## Miscellanea -Taking inhibitor locks is a privileged operation. 
Depending on the action _org.freedesktop.login1.inhibit-block-shutdown_, _org.freedesktop.login1.inhibit-delay-shutdown_, _org.freedesktop.login1.inhibit-block-sleep_, _org.freedesktop.login1.inhibit-delay-sleep_, _org.freedesktop.login1.inhibit-block-idle_, _org.freedesktop.login1.inhibit-handle-power-key_, _org.freedesktop.login1.inhibit-handle-suspend-key_, _org.freedesktop.login1.inhibit-handle-hibernate-key_,_org.freedesktop.login1.inhibit-handle-lid-switch_. In general it should be assumed that delay locks are easier to obtain than blocking locks, simply because their impact is much more minimal. Note that the policy checks for Inhibit() are never interactive. +Taking inhibitor locks is a privileged operation. Depending on the action _org.freedesktop.login1.inhibit-block-shutdown_, _org.freedesktop.login1.inhibit-delay-shutdown_, _org.freedesktop.login1.inhibit-block-sleep_, _org.freedesktop.login1.inhibit-delay-sleep_, _org.freedesktop.login1.inhibit-block-idle_, _org.freedesktop.login1.inhibit-handle-power-key_, _org.freedesktop.login1.inhibit-handle-suspend-key_, _org.freedesktop.login1.inhibit-handle-hibernate-key_,_org.freedesktop.login1.inhibit-handle-lid-switch_. + +In general it should be assumed that delay locks are easier to obtain than blocking locks, simply because their impact is much more minimal. +Note that the policy checks for Inhibit() are never interactive. -Inhibitor locks should not be misused. For example taking idle blocking locks without a very good reason might cause mobile devices to never auto-suspend. This can be quite detrimental for the battery. +Inhibitor locks should not be misused. +For example taking idle blocking locks without a very good reason might cause mobile devices to never auto-suspend. +This can be quite detrimental for the battery. If an application finds a lock denied it should not consider this much of an error and just continue its operation without the protecting lock. 
The tool [systemd-inhibit(1)](http://www.freedesktop.org/software/systemd/man/systemd-inhibit.html) may be used to take locks or list active locks from the command line. -Note that gnome-session also provides an [inhibitor API](http://people.gnome.org/~mccann/gnome-session/docs/gnome-session.html#org.gnome.SessionManager.Inhibit), which is very similar to the one of systemd. Internally, locks taken on gnome-session's interface will be forwarded to logind, hence both APIs are supported. While both offer similar functionality they do differ in some regards. For obvious reasons gnome-session can offer logout locks and screensaver avoidance locks which logind lacks. logind's API OTOH supports delay locks in addition to block locks like GNOME. Also, logind is available to system components, and centralizes locks from all users, not just those of a specific one. In general: if in doubt it is probably advisable to stick to the GNOME locks, unless there is a good reason to use the logind APIs directly. When locks are to be enumerated it is better to use the logind APIs however, since they also include locks taken by system services and other users. +Note that gnome-session also provides an [inhibitor API](http://people.gnome.org/~mccann/gnome-session/docs/gnome-session.html#org.gnome.SessionManager.Inhibit), which is very similar to the one of systemd. +Internally, locks taken on gnome-session's interface will be forwarded to logind, hence both APIs are supported. + +While both offer similar functionality they do differ in some regards. +For obvious reasons gnome-session can offer logout locks and screensaver avoidance locks which logind lacks. + +logind's API OTOH supports delay locks in addition to block locks like GNOME. +Also, logind is available to system components, and centralizes locks from all users, not just those of a specific one. 
+ +In general: if in doubt it is probably advisable to stick to the GNOME locks, unless there is a good reason to use the logind APIs directly. +When locks are to be enumerated it is better to use the logind APIs however, since they also include locks taken by system services and other users. diff --git a/docs/INITRD_INTERFACE.md b/docs/INITRD_INTERFACE.md index 0461ae2..402b6a9 100644 --- a/docs/INITRD_INTERFACE.md +++ b/docs/INITRD_INTERFACE.md @@ -26,8 +26,7 @@ Arch Linux initrds. * It's highly recommended that the initrd also mounts `/usr/` (if split off) as appropriate and passes it pre-mounted to the main system, to avoid the - problems described in [Booting without /usr is - Broken](https://www.freedesktop.org/wiki/Software/systemd/separate-usr-is-broken). + problems described in [Booting without /usr is Broken](/SEPARATE_USR_IS_BROKEN). * If the executable `/run/initramfs/shutdown` exists systemd will use it to jump back into the initrd on shutdown. `/run/initramfs/` should be a usable @@ -36,11 +35,11 @@ Arch Linux initrds. example was needed to mount the root file system. It's the job of the initrd to set up this directory and executable in the right way so that this works correctly. The shutdown binary is invoked with the shutdown verb as `argv[1]`, - optionally followed (in `argv[2]`, `argv[3]`, … systemd's original command + optionally followed (in `argv[2]`, `argv[3]`, …) by systemd's original command line options, for example `--log-level=` and similar. * Storage daemons run from the initrd should follow the guide on - [systemd and Storage Daemons for the Root File System](ROOT_STORAGE_DAEMONS) + [systemd and Storage Daemons for the Root File System](/ROOT_STORAGE_DAEMONS) to survive properly from the boot initrd all the way to the point where systemd jumps back into the initrd for shutdown. @@ -67,4 +66,4 @@ systemd. Here are a few terse notes: * The switch-root operation will result in a killing spree of all running processes. 
Some processes might need to be excluded from that, see the guide - on [systemd and Storage Daemons for the Root File System](ROOT_STORAGE_DAEMONS). + on [systemd and Storage Daemons for the Root File System](/ROOT_STORAGE_DAEMONS). diff --git a/docs/JOURNAL_EXPORT_FORMATS.md b/docs/JOURNAL_EXPORT_FORMATS.md index e1eb0d3..0da5252 100644 --- a/docs/JOURNAL_EXPORT_FORMATS.md +++ b/docs/JOURNAL_EXPORT_FORMATS.md @@ -11,16 +11,28 @@ SPDX-License-Identifier: LGPL-2.1-or-later _Note that this document describes the binary serialization format of journals only, as used for transfer across the network. For interfacing with web technologies there's the Journal JSON Format, described below. -The binary format on disk is documented as the [Journal File Format](JOURNAL_FILE_FORMAT)._ +The binary format on disk is documented as the [Journal File Format](/JOURNAL_FILE_FORMAT)._ _Before reading on, please make sure you are aware of the [basic properties of journal entries](https://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html), in particular realize that they may include binary non-text data (though usually don't), and the same field might have multiple values assigned within the same entry (though usually hasn't)._ -When exporting journal data for other uses or transferring it via the network/local IPC the _journal export format_ is used. It's a simple serialization of journal entries, that is easy to read without any special tools, but still binary safe where necessary. The format is like this: +When exporting journal data for other uses or transferring it via the network/local IPC the _journal export format_ is used. +It's a simple serialization of journal entries, that is easy to read without any special tools, but still binary safe where necessary. +The format is like this: * Two journal entries that follow each other are separated by a double newline. 
-* Journal fields consisting only of valid non-control UTF-8 codepoints are serialized as they are (i.e. the field name, followed by '=', followed by field data), followed by a newline as separator to the next field. Note that fields containing newlines cannot be formatted like this. Non-control UTF-8 codepoints are the codepoints with value at or above 32 (' '), or equal to 9 (TAB). -* Other journal fields are serialized in a special binary safe way: field name, followed by newline, followed by a binary 64-bit little endian size value, followed by the binary field data, followed by a newline as separator to the next field. -* Entry metadata that is not actually a field is serialized like it was a field, but beginning with two underscores. More specifically, `__CURSOR=`, `__REALTIME_TIMESTAMP=`, `__MONOTONIC_TIMESTAMP=`, `__SEQNUM=`, `__SEQNUM_ID` are introduced this way. Note that these meta-fields are only generated when actual journal files are serialized. They are omitted for entries that do not originate from a journal file (for example because they are transferred for the first time to be stored in one). Or in other words: if you are generating this format you shouldn't care about these special double-underscore fields. But you might find them usable when you deserialize the format generated by us. Additional fields prefixed with two underscores might be added later on, your parser should skip over the fields it does not know. +* Journal fields consisting only of valid non-control UTF-8 codepoints are serialized as they are + (i.e. the field name, followed by '=', followed by field data), followed by a newline as separator to the next field. + Note that fields containing newlines cannot be formatted like this. + Non-control UTF-8 codepoints are the codepoints with value at or above 32 (' '), or equal to 9 (TAB). 
+* Other journal fields are serialized in a special binary safe way: + field name, followed by newline, followed by a binary 64-bit little endian size value, followed by the binary field data, followed by a newline as separator to the next field. +* Entry metadata that is not actually a field is serialized like it was a field, but beginning with two underscores. + More specifically, `__CURSOR=`, `__REALTIME_TIMESTAMP=`, `__MONOTONIC_TIMESTAMP=`, `__SEQNUM=`, `__SEQNUM_ID=` are introduced this way. + Note that these meta-fields are only generated when actual journal files are serialized. + They are omitted for entries that do not originate from a journal file (for example because they are transferred for the first time to be stored in one). + Or in other words: if you are generating this format you shouldn't care about these special double-underscore fields. + But you might find them usable when you deserialize the format generated by us. + Additional fields prefixed with two underscores might be added later on, your parser should skip over the fields it does not know. * The order in which fields appear in an entry is undefined and might be different for each entry that is serialized. And that's already it. @@ -124,16 +136,24 @@ _SOURCE_REALTIME_TIMESTAMP=1423944916372858 _Note that this section describes the JSON serialization format of the journal only, as used for interfacing with web technologies. For binary transfer of journal data across the network there's the Journal Export Format described above. 
-The binary format on disk is documented as [Journal File Format](JOURNAL_FILE_FORMAT)._ +The binary format on disk is documented as [Journal File Format](/JOURNAL_FILE_FORMAT)._ _Before reading on, please make sure you are aware of the [basic properties of journal entries](https://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html), in particular realize that they may include binary non-text data (though usually don't), and the same field might have multiple values assigned within the same entry (though usually hasn't)._ In most cases the Journal JSON serialization is the obvious mapping of the entry field names (as JSON strings) to the entry field values (also as JSON strings) encapsulated in one JSON object. However, there are a few special cases to handle: -* A field that contains non-printable or non-UTF8 is serialized as a number array instead. This is necessary to handle binary data in a safe way without losing data, since JSON cannot embed binary data natively. Each byte of the binary field will be mapped to its numeric value in the range 0…255. -* The JSON serializer can optionally skip huge (as in larger than a specific threshold) data fields from the JSON object. If that is enabled and a data field is too large, the field name is still included in the JSON object but assigned _null_. -* Within the same entry, Journal fields may have multiple values assigned. This is not allowed in JSON. The serializer will hence create a single JSON field only for these cases, and assign it an array of values (which the can be strings, _null_ or number arrays, see above). 
-* If the JSON data originates from a journal file it may include the special addressing fields `__CURSOR`, `__REALTIME_TIMESTAMP`, `__MONOTONIC_TIMESTAMP`, `__SEQNUM`, `__SEQNUM_ID`, which contain the cursor string of this entry as string, the realtime/monotonic timestamps of this entry as formatted numeric string of usec since the respective epoch, and the sequence number and associated sequence number ID, both formatted as strings. +* A field that contains non-printable or non-UTF8 is serialized as a number array instead. + This is necessary to handle binary data in a safe way without losing data, since JSON cannot embed binary data natively. + Each byte of the binary field will be mapped to its numeric value in the range 0…255. +* The JSON serializer can optionally skip huge (as in larger than a specific threshold) data fields from the JSON object. + If that is enabled and a data field is too large, the field name is still included in the JSON object but assigned _null_. +* Within the same entry, Journal fields may have multiple values assigned. This is not allowed in JSON. + The serializer will hence create a single JSON field only for these cases, and assign it an array of values + (which then can be strings, _null_ or number arrays, see above). +* If the JSON data originates from a journal file it may include the special addressing fields + `__CURSOR`, `__REALTIME_TIMESTAMP`, `__MONOTONIC_TIMESTAMP`, `__SEQNUM`, `__SEQNUM_ID`, which contain the cursor string of this entry as string, + the realtime/monotonic timestamps of this entry as formatted numeric string of usec since the respective epoch, + and the sequence number and associated sequence number ID, both formatted as strings. Here's an example, illustrating all cases mentioned above. 
Consider this entry: diff --git a/docs/JOURNAL_FILE_FORMAT.md b/docs/JOURNAL_FILE_FORMAT.md index e0737c5..7d3b039 100644 --- a/docs/JOURNAL_FILE_FORMAT.md +++ b/docs/JOURNAL_FILE_FORMAT.md @@ -8,8 +8,9 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Journal File Format _Note that this document describes the binary on-disk format of journals only. -For interfacing with web technologies there's the [Journal JSON Format](JOURNAL_EXPORT_FORMATS.md#journal-json-format). -For transfer of journal data across the network there's the [Journal Export Format](JOURNAL_EXPORT_FORMATS.md#journal-export-format)._ +For interfacing with web technologies there's the [Journal JSON Format](JOURNAL_EXPORT_FORMATS#journal-json-format). +For transfer of journal data across the network there's the +[Journal Export Format](JOURNAL_EXPORT_FORMATS#journal-export-format)._ The systemd journal stores log data in a binary format with several features: @@ -20,65 +21,62 @@ The systemd journal stores log data in a binary format with several features: * Support for in-line compression * Support for in-line Forward Secure Sealing -This document explains the basic structure of the file format on disk. We are -making this available primarily to allow review and provide documentation. Note -that the actual implementation in the [systemd -codebase](https://github.com/systemd/systemd/blob/main/src/libsystemd/sd-journal/) is the -only ultimately authoritative description of the format, so if this document -and the code disagree, the code is right. That said we'll of course try hard to -keep this document up-to-date and accurate. - -Instead of implementing your own reader or writer for journal files we ask you -to use the [Journal's native C -API](https://www.freedesktop.org/software/systemd/man/sd-journal.html) to access -these files. It provides you with full access to the files, and will not -withhold any data. 
If you find a limitation, please ping us and we might add -some additional interfaces for you. - -If you need access to the raw journal data in serialized stream form without C -API our recommendation is to make use of the [Journal Export -Format](https://systemd.io/JOURNAL_EXPORT_FORMATS#journal-export-format), which you can -get via `journalctl -o export` or via `systemd-journal-gatewayd`. The export -format is much simpler to parse, but complete and accurate. Due to its -stream-based nature it is not indexed. - -_Or, to put this in other words: this low-level document is probably not what -you want to use as base of your project. You want our [C -API](https://www.freedesktop.org/software/systemd/man/sd-journal.html) instead! +This document explains the basic structure of the file format on disk. +We are making this available primarily to allow review and provide documentation. +Note that the actual implementation in the +[systemd codebase](https://github.com/systemd/systemd/blob/main/src/libsystemd/sd-journal/) +is the only ultimately authoritative description of the format, +so if this document and the code disagree, the code is right. +That said we'll of course try hard to keep this document up-to-date and accurate. + +Instead of implementing your own reader or writer for journal files we ask you to use the +[Journal's native C API](https://www.freedesktop.org/software/systemd/man/sd-journal.html) +to access these files. +It provides you with full access to the files, and will not withhold any data. +If you find a limitation, please ping us and we might add some additional interfaces for you. + +If you need access to the raw journal data in serialized stream form without C API our recommendation is to make use of the +[Journal Export Format](JOURNAL_EXPORT_FORMATS#journal-export-format), +which you can get via `journalctl -o export` or via `systemd-journal-gatewayd`. +The export format is much simpler to parse, but complete and accurate.
+Due to its stream-based nature it is not indexed. + +_Or, to put this in other words: this low-level document is probably not what you want to use as base of your project. +You want our [C API](https://www.freedesktop.org/software/systemd/man/sd-journal.html) instead! And if you really don't want the C API, then you want the -[Journal Export Format or Journal JSON Format](JOURNAL_EXPORT_FORMATS) -instead! This document is primarily for your entertainment and education. +[Journal Export Format or Journal JSON Format](/JOURNAL_EXPORT_FORMATS) instead! +This document is primarily for your entertainment and education. Thank you!_ -This document assumes you have a basic understanding of the journal concepts, -the properties of a journal entry and so on. If not, please go and read up, -then come back! This is a good opportunity to read about the [basic properties -of journal -entries](https://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html), -in particular realize that they may include binary non-text data (though -usually don't), and the same field might have multiple values assigned within -the same entry. - -This document describes the current format of systemd 246. The documented -format is compatible with the format used in the first versions of the journal, +This document assumes you have a basic understanding of the journal concepts, the properties of a journal entry and so on. +If not, please go and read up, then come back! +This is a good opportunity to read about the +[basic properties of journal entries](https://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html), +in particular realize that they may include binary non-text data (though usually don't), +and the same field might have multiple values assigned within the same entry. + +This document describes the current format of systemd 246. 
+The documented format is compatible with the format used in the first versions of the journal, but received various compatible and incompatible additions since. -If you are wondering why the journal file format has been created in the first -place instead of adopting an existing database implementation, please have a -look [at this -thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054.html). +If you are wondering why the journal file format has been created in the first place instead of adopting an existing database implementation, +please have a look [at this thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054.html). ## Basics * All offsets, sizes, time values, hashes (and most other numeric values) are 32-bit/64-bit unsigned integers in LE format. * Offsets are always relative to the beginning of the file. -* The 64-bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32-bit integer it returns as higher 32-bit part of the 64-bit value, and the second one uses as lower 32-bit part. +* The 64-bit hash function siphash24 is used for newer journal files. + For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, + more specifically `jenkins_hashlittle2()` with the first 32-bit integer it returns as higher 32-bit part of the 64-bit value, + and the second one used as the lower 32-bit part. * All structures are aligned to 64-bit boundaries and padded to multiples of 64-bit * The format is designed to be read and written via memory mapping using multiple mapped windows. * All time values are stored in usec since the respective epoch. * Wall clock time values are relative to the Unix time epoch, i.e. January 1st, 1970.
(`CLOCK_REALTIME`) -* Monotonic time values are always stored jointly with the kernel boot ID value (i.e. `/proc/sys/kernel/random/boot_id`) they belong to. They tend to be relative to the start of the boot, but aren't for containers. (`CLOCK_MONOTONIC`) +* Monotonic time values are always stored jointly with the kernel boot ID value (i.e. `/proc/sys/kernel/random/boot_id`) they belong to. + They tend to be relative to the start of the boot, but aren't for containers. (`CLOCK_MONOTONIC`) * Randomized, unique 128-bit IDs are used in various locations. These are generally UUID v4 compatible, but this is not a requirement. ## General Rules diff --git a/docs/MINIMAL_BUILDS.md b/docs/MINIMAL_BUILDS.md index faa4f2d..1fc85e7 100644 --- a/docs/MINIMAL_BUILDS.md +++ b/docs/MINIMAL_BUILDS.md @@ -7,12 +7,21 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Minimal Builds -systemd includes a variety of components. The core components are always built (which includes systemd itself, as well as udevd and journald). Many of the other components can be disabled at compile time with configure switches. +systemd includes a variety of components. +The core components are always built (which includes systemd itself, as well as udevd and journald). +Many of the other components can be disabled at compile time with configure switches. -For some uses the configure switches do not provide sufficient modularity. For example, they cannot be used to build only the man pages, or to build only the tmpfiles tool, only detect-virt or only udevd. If such modularity is required that goes beyond what we support in the configure script we can suggest you two options: +For some uses the configure switches do not provide sufficient modularity. +For example, they cannot be used to build only the man pages, or to build only the tmpfiles tool, only detect-virt or only udevd. + +If such modularity is required that goes beyond what we support in the configure script we can suggest you two options: + +1. 
Build systemd as usual, but pick only the built files you need from the result of "make install DESTDIR=", by using the file listing functionality of your packaging software. +For example: if all you want is the tmpfiles tool, then build systemd normally, and list only /usr/bin/systemd-tmpfiles in the .spec file for your RPM package. +This is simple to do, allows you to pick exactly what you need, but requires a larger number of build dependencies (but not runtime dependencies). -1. Build systemd as usual, but pick only the built files you need from the result of "make install DESTDIR=", by using the file listing functionality of your packaging software. For example: if all you want is the tmpfiles tool, then build systemd normally, and list only /usr/bin/systemd-tmpfiles in the .spec file for your RPM package. This is simple to do, allows you to pick exactly what you need, but requires a larger number of build dependencies (but not runtime dependencies). 2. If you want to reduce the build time dependencies (though only dbus and libcap are needed as build time deps) and you know the specific component you are interested in doesn't need it, then create a dummy .pc file for that dependency (i.e. basically empty), and configure systemd with PKG_CONFIG_PATH set to the path of these dummy .pc files. Then, build only the few bits you need with "make foobar", where foobar is the file you need. - We are open to merging patches for the build system that make more "fringe" components of systemd optional. However, please be aware that in order to keep the complexity of our build system small and its readability high, and to make our lives easier, we will not accept patches that make the minimal core components optional, i.e. systemd itself, journald and udevd. -Note that the .pc file trick mentioned above currently doesn't work for libcap, since libcap doesn't provide a .pc file. We invite you to go ahead and post a patch to libcap upstream to get this corrected. 
We'll happily change our build system to look for that .pc file then. (a .pc file has been sent to upstream by Bryan Kadzban. It is also available at [http://kdzbn.homelinux.net/libcap-add-pkg-config.patch](http://kdzbn.homelinux.net/libcap-add-pkg-config.patch)). +We are open to merging patches for the build system that make more "fringe" components of systemd optional. However, please be aware that in order to keep the complexity of our build system small and its readability high, and to make our lives easier, we will not accept patches that make the minimal core components optional, i.e. systemd itself, journald and udevd. + +Note that the .pc file trick mentioned above currently doesn't work for libcap, since libcap doesn't provide a .pc file. We invite you to go ahead and post a patch to libcap upstream to get this corrected. We'll happily change our build system to look for that .pc file then. (a .pc file has been sent to upstream by Bryan Kadzban). diff --git a/docs/MY_SERVICE_CANT_GET_REATLIME.md b/docs/MY_SERVICE_CANT_GET_REATLIME.md index 20d31fb..26a2e6e 100644 --- a/docs/MY_SERVICE_CANT_GET_REATLIME.md +++ b/docs/MY_SERVICE_CANT_GET_REATLIME.md @@ -7,22 +7,49 @@ SPDX-License-Identifier: LGPL-2.1-or-later # My Service Can't Get Realtime! -_So, you have a service that requires real-time scheduling. When you run this service on your systemd system it is unable to acquire real-time scheduling, even though it is full root and has all possible privileges. And now you are wondering what is going on and what you can do about it?_ +_So, you have a service that requires real-time scheduling. +When you run this service on your systemd system it is unable to acquire real-time scheduling, +even though it is full root and has all possible privileges. +And now you are wondering what is going on and what you can do about it?_ ## What is Going on? -By default systemd places all system services into their own control groups in the "cpu" hierarchy. 
This has the benefit that the CPU usage of services with many worker threads or processes (think: Apache with all its gazillion CGIs and stuff) gets roughly the same amount of CPU as a service with very few worker threads (think: MySQL). Instead of evening out CPU _per process_ this will cause CPU to be evened out _per service_. - -Now, the "cpu" cgroup controller of the Linux kernel has one major shortcoming: if a cgroup is created it needs an explicit, absolute RT time budget assigned, or otherwise RT is not available to any process in the group, and an attempt to acquire it will fail with EPERM. systemd will not assign any RT time budgets to the "cpu" cgroups it creates, simply because there is no feasible way to do that, since the budget needs to be specified in absolute time units and comes from a fixed pool. Or in other words: we'd love to assign a budget, but there are no sane values we could use. Thus, in its default configuration RT scheduling is simply not available for any system services. +By default systemd places all system services into their own control groups in the "cpu" hierarchy. +This has the benefit that the CPU usage of services with many worker threads or processes +(think: Apache with all its gazillion CGIs and stuff) +gets roughly the same amount of CPU as a service with very few worker threads (think: MySQL). +Instead of evening out CPU _per process_ this will cause CPU to be evened out _per service_. + +Now, the "cpu" cgroup controller of the Linux kernel has one major shortcoming: +if a cgroup is created it needs an explicit, absolute RT time budget assigned, +or otherwise RT is not available to any process in the group, and an attempt to acquire it will fail with EPERM. +systemd will not assign any RT time budgets to the "cpu" cgroups it creates, +simply because there is no feasible way to do that, +since the budget needs to be specified in absolute time units and comes from a fixed pool. 
+Or in other words: we'd love to assign a budget, but there are no sane values we could use. +Thus, in its default configuration RT scheduling is simply not available for any system services. ## Working Around the Issue Of course, that's quite a limitation, so here's how you work around this: -* One option is to simply globally turn off that systemd creates a "cpu" cgroup for each of the system services. For that, edit `/etc/systemd/system.conf` and set `DefaultControllers=` to the empty string, then reboot. (An alternative is to disable the "cpu" controller in your kernel, entirely. systemd will not attempt to make use of controllers that aren't available in the kernel.) -* Another option is to turn this off for the specific service only. For that, edit your service file, and add `ControlGroup=cpu:/` to its `[Service]` section. This overrides the default logic for this one service only, and places all its processes back in the root cgroup of the "cpu" hierarchy, which has the full RT budget assigned. -* A third option is to simply assign your service a realtime budget. For that use `ControlGroupAttribute=cpu.rt_runtime_us 500000` in its `[Service]` or suchlike. See [the kernel documentation](http://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt) for details. The latter two options are not available for System V services. A possible solution is to write a small wrapper service file that simply calls the SysV script's start verb in `ExecStart=` and the stop verb in `ExecStop=`. (It also needs to set `RemainAfterExit=1` and `Type=forking`!) - -Note that this all only applies to services. By default, user applications run in the root cgroup of the "cpu" hierarchy, which avoids these problems for normal user applications. - -In the long run we hope that the kernel is fixed to not require an RT budget to be assigned for any cgroup created before a process can acquire RT (i.e. 
a process' RT budget should be derived from the nearest ancestor cgroup which has a budget assigned, rather than unconditionally its own uninitialized budget.) Ideally, we'd also like to create a per-user cgroup by default, so that users with many processes get roughly the same amount of CPU as users with very few. +* One option is to simply globally turn off that systemd creates a "cpu" cgroup for each of the system services. +For that, edit `/etc/systemd/system.conf` and set `DefaultControllers=` to the empty string, then reboot. +(An alternative is to disable the "cpu" controller in your kernel, entirely. +systemd will not attempt to make use of controllers that aren't available in the kernel.) +* Another option is to turn this off for the specific service only. +For that, edit your service file, and add `ControlGroup=cpu:/` to its `[Service]` section. +This overrides the default logic for this one service only, +and places all its processes back in the root cgroup of the "cpu" hierarchy, which has the full RT budget assigned. +* A third option is to simply assign your service a realtime budget. +For that use `ControlGroupAttribute=cpu.rt_runtime_us 500000` in its `[Service]` or suchlike. +See [the kernel documentation](http://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt) for details. +The latter two options are not available for System V services. +A possible solution is to write a small wrapper service file that simply calls the SysV script's start verb in `ExecStart=` and the stop verb in `ExecStop=`. +(It also needs to set `RemainAfterExit=1` and `Type=forking`!) + +Note that this all only applies to services. +By default, user applications run in the root cgroup of the "cpu" hierarchy, which avoids these problems for normal user applications. + +In the long run we hope that the kernel is fixed to not require an RT budget to be assigned for any cgroup created before a process can acquire RT (i.e. 
a process' RT budget should be derived from the nearest ancestor cgroup which has a budget assigned, rather than unconditionally its own uninitialized budget.) +Ideally, we'd also like to create a per-user cgroup by default, so that users with many processes get roughly the same amount of CPU as users with very few. diff --git a/docs/OPTIMIZATIONS.md b/docs/OPTIMIZATIONS.md index 3c8ac48..d63d09e 100644 --- a/docs/OPTIMIZATIONS.md +++ b/docs/OPTIMIZATIONS.md @@ -9,44 +9,123 @@ SPDX-License-Identifier: LGPL-2.1-or-later _So you are working on a Linux distribution or appliance and need very fast boot-ups?_ -systemd can already offer boot times of < 1s for the Core OS (userspace only, i.e. only the bits controlled by systemd) and < 2s for a complete up-to-date desktop environments on simpler (but modern, i.e. SSDs) laptops if configured properly (examples: [http://git.fenrus.org/tmp/bootchart-20120512-1036.svg](http://git.fenrus.org/tmp/bootchart-20120512-1036.svg)). In this page we want to suggest a couple of ideas how to achieve that, and if the resulting boot times do not suffice where we believe room for improvements are that we'd like to see implemented sooner or later. If you are interested in investing engineering manpower in systemd to get to even shorter boot times, this list hopefully includes a few good suggestions to start with. +systemd can already offer boot times of < 1s for the Core OS (userspace only, i.e. only the bits controlled by systemd) and < 2s for a complete up-to-date desktop environments on simpler (but modern, i.e. SSDs) laptops if configured properly (examples: [http://git.fenrus.org/tmp/bootchart-20120512-1036.svg](http://git.fenrus.org/tmp/bootchart-20120512-1036.svg)). + +In this page we want to suggest a couple of ideas how to achieve that, and if the resulting boot times do not suffice where we believe room for improvements are that we'd like to see implemented sooner or later. 
+ +If you are interested in investing engineering manpower in systemd to get to even shorter boot times, this list hopefully includes a few good suggestions to start with. Of course, before optimizing you should instrument the boot to generate profiling data, so make sure you know your way around with systemd-bootchart, systemd-analyze and pytimechart! Optimizations without profiling are premature optimizations! -Note that systemd's fast performance is a side effect of its design but wasn't the primary design goal. As it stands now systemd (and Fedora using it) has been optimized very little and still has a lot of room for improvements. There are still many low hanging fruits to pick! +Note that systemd's fast performance is a side effect of its design but wasn't the primary design goal. +As it stands now systemd (and Fedora using it) has been optimized very little and still has a lot of room for improvements. There are still many low hanging fruits to pick! -We are very interested in merging optimization work into systemd upstream. Note however that we are careful not to merge work that would drastically limit the general purpose usefulness or reliability of our code, or that would make systemd harder to maintain. So in case you work on optimizations for systemd, try to keep your stuff mainlineable. If in doubt, ask us. +We are very interested in merging optimization work into systemd upstream. +Note however that we are careful not to merge work that would drastically limit the general purpose usefulness or reliability of our code, or that would make systemd harder to maintain. +So in case you work on optimizations for systemd, try to keep your stuff mainlineable. If in doubt, ask us. -The distributions have adopted systemd to varying levels. While there are many compatibility scripts in the boot process on Debian for example, Fedora has much less (but still too many). For better performance consider disabling these scripts, or using a different distribution. 
+The distributions have adopted systemd to varying levels. +While there are many compatibility scripts in the boot process on Debian for example, Fedora has much less (but still too many). +For better performance consider disabling these scripts, or using a different distribution. It is our intention to optimize the upstream distributions by default (in particular Fedora) so that these optimizations won't be necessary. However, this will take some time, especially since making these changes is often not trivial when the general purpose usefulness cannot be compromised. What you can optimize (locally) without writing any code: -1. Make sure not to use any fake block device storage technology such as LVM (as installed by default by various distributions, including Fedora) they result in the systemd-udev-settle.service unit to be pulled in. Settling device enumeration is slow, racy and mostly obsolete. Since LVM (still) hasn't been updated to handle Linux' event based design properly, settling device enumeration is still required for it, but it will slow down boot substantially. On Fedora, use "systemctl mask fedora-wait-storage.service fedora-storage-init-late.service fedora-storage-init.service" to get rid of all those storage technologies. Of course, don't try this if you actually installed your system with LVM. (The only fake block device storage technology that currently handles this all properly and doesn't require settling device enumerations is LUKS disk encryption.) -2. Consider bypassing the initrd, if you use one. On Fedora, make sure to install the OS on a plain disk without encryption, and without LVM/RAID/... (encrypted /home is fine) when doing this. Then, simply edit grub.conf and remove the initrd from your configuration, and change the root= kernel command line parameter so that it uses kernel device names instead of UUIDs, i.e. "root=sda5" or what is appropriate for your system. 
Also specify the root FS type with "rootfstype=ext4" (or as appropriate). Note that using kernel devices names is not really that nice if you have multiple hard disks, but if you are doing this for a laptop (i.e. with a single hdd), this should be fine. Note that you shouldn't need to rebuild your kernel in order to bypass the initrd. Distribution kernels (at least Fedora's) work fine with and without initrd, and systemd supports both ways to be started. -3. Consider disabling SELinux and auditing. We recommend leaving SELinux on, for security reasons, but truth be told you can save 100ms of your boot if you disable it. Use selinux=0 on the kernel cmdline. -4. Consider disabling Plymouth. If userspace boots in less than 1s, a boot splash is hardly useful, hence consider passing plymouth.enable=0 on the kernel command line. Plymouth is generally quite fast, but currently still forces settling device enumerations for graphics cards, which is slow. Disabling plymouth removes this bit of the boot. -5. Consider uninstalling syslog. The journal is used anyway on newer systemd systems, and is usually more than sufficient for desktops, and embedded, and even many servers. Just uninstall all syslog implementations and remember that "journalctl" will get you a pixel perfect copy of the classic /var/log/messages message log. To make journal logs persistent (i.e. so that they aren't lost at boot) make sure to run "mkdir -p /var/log/journal". -6. Consider masking a couple of redundant distribution boot scripts, that artificially slow down the boot. For example, on Fedora it's a good idea to mask fedora-autoswap.service fedora-configure.service fedora-loadmodules.service fedora-readonly.service. Also remove all LVM/RAID/FCOE/iSCSI related packages which slow down the boot substantially even if no storage of the specific kind is used (and if these RPMs can't be removed because some important packages require them, at least mask the respective services). +1. 
Make sure not to use any fake block device storage technology such as LVM (as installed by default by various distributions, including Fedora) they result in the systemd-udev-settle.service unit to be pulled in. Settling device enumeration is slow, racy and mostly obsolete. Since LVM (still) hasn't been updated to handle Linux' event based design properly, settling device enumeration is still required for it, but it will slow down boot substantially. +On Fedora, use "systemctl mask fedora-wait-storage.service fedora-storage-init-late.service fedora-storage-init.service" to get rid of all those storage technologies. +Of course, don't try this if you actually installed your system with LVM. (The only fake block device storage technology that currently handles this all properly and doesn't require settling device enumerations is LUKS disk encryption.) + +2. Consider bypassing the initrd, if you use one. +On Fedora, make sure to install the OS on a plain disk without encryption, and without LVM/RAID/... (encrypted /home is fine) when doing this. +Then, simply edit grub.conf and remove the initrd from your configuration, and change the root= kernel command line parameter so that it uses kernel device names instead of UUIDs, i.e. "root=sda5" or what is appropriate for your system. +Also specify the root FS type with "rootfstype=ext4" (or as appropriate). +Note that using kernel devices names is not really that nice if you have multiple hard disks, but if you are doing this for a laptop (i.e. with a single hdd), this should be fine. +Note that you shouldn't need to rebuild your kernel in order to bypass the initrd. +Distribution kernels (at least Fedora's) work fine with and without initrd, and systemd supports both ways to be started. + +3. Consider disabling SELinux and auditing. +We recommend leaving SELinux on, for security reasons, but truth be told you can save 100ms of your boot if you disable it. +Use selinux=0 on the kernel cmdline. + +4. 
Consider disabling Plymouth. If userspace boots in less than 1s, a boot splash is hardly useful, hence consider passing plymouth.enable=0 on the kernel command line. +Plymouth is generally quite fast, but currently still forces settling device enumerations for graphics cards, which is slow. +Disabling plymouth removes this bit of the boot. + +5. Consider uninstalling syslog. The journal is used anyway on newer systemd systems, and is usually more than sufficient for desktops, and embedded, and even many servers. +Just uninstall all syslog implementations and remember that "journalctl" will get you a pixel perfect copy of the classic /var/log/messages message log. +To make journal logs persistent (i.e. so that they aren't lost at boot) make sure to run "mkdir -p /var/log/journal". + +6. Consider masking a couple of redundant distribution boot scripts, that artificially slow down the boot. For example, on Fedora it's a good idea to mask fedora-autoswap.service fedora-configure.service fedora-loadmodules.service fedora-readonly.service. +Also remove all LVM/RAID/FCOE/iSCSI related packages which slow down the boot substantially even if no storage of the specific kind is used (and if these RPMs can't be removed because some important packages require them, at least mask the respective services). + 7. Console output is slow. So if you measure your boot times and ship your system, make sure to use "quiet" on the command line and disable systemd debug logging (if you enabled it before). -8. Consider removing cron from your system and use systemd timer units instead. Timer units currently have no support for calendar times (i.e. cannot be used to spawn things "at 6 am every Monday", but can do "run this every 7 days"), but for the usual /etc/cron.daily/, /etc/cron.weekly/, ... should be good enough, if the time of day of the execution doesn't matter (just add four small service and timer units for supporting these dirs. 
Eventually we might support these out of the box, but until then, just write your own scriplets for this). + +8. Consider removing cron from your system and use systemd timer units instead. +Timer units currently have no support for calendar times (i.e. cannot be used to spawn things "at 6 am every Monday", but can do "run this every 7 days"), but for the usual /etc/cron.daily/, /etc/cron.weekly/, ... should be good enough, if the time of day of the execution doesn't matter (just add four small service and timer units for supporting these dirs. Eventually we might support these out of the box, but until then, just write your own scriptlets for this). + 9. If you work on an appliance, consider disabling readahead collection in the shipped devices, but leave readahead replay enabled. -10. If you work on an appliance, make sure to build all drivers you need into the kernel, since module loading is slow. If you build a distribution at least built all the stuff 90% of all people need into your kernel, i.e. at least USB, AHCI and HDA! + +10. If you work on an appliance, make sure to build all drivers you need into the kernel, since module loading is slow. +If you build a distribution at least build all the stuff 90% of all people need into your kernel, i.e. at least USB, AHCI and HDA! + 11. If it works, use libahci.ignore_sss=1 when booting. + 12. Use a modern desktop that doesn't pull in ConsoleKit anymore. For example GNOME 3.4. -13. Get rid of a local MTA, if you are building a desktop or appliance. I.e. on Fedora remove the sendmail RPMs which are (still!) installed by default. -14. If you build an appliance, don't forget that various components of systemd are optional and may be disabled during build time, see "./configure --help" for details. For example, get rid of the virtual console setup if you never have local console users (this is a major source of slowness, actually). In addition, if you never have local users at all, consider disabling logind.
And there are more components that are frequently unnecessary on appliances. -15. This goes without saying: the boot-up gets faster if you started less stuff at boot. So run "systemctl" and check if there's stuff you don't need and disable it, or even remove its package. -16. Don't use debug kernels. Debug kernels are slow. Fedora exclusively uses debug kernels during the development phase of each release. If you care about boot performance, either recompile these kernels with debugging turned off or wait for the final distribution release. It's a drastic difference. That also means that if you publish boot performance data of a Fedora pre-release distribution you are doing something wrong. ;-) So much about the basics of how to get a quick boot. Now, here's an incomprehensive list of things we'd like to see improved in systemd (and elsewhere) over short or long and need a bit of hacking (sometimes more, and sometimes less): -17. Get rid of systemd-cgroups-agent. Currently, whenever a systemd cgroup runs empty a tool "systemd-cgroups-agent" is invoked by the kernel which then notifies systemd about it. The need for this tool should really go away, which will save a number of forked processes at boot, and should make things faster (especially shutdown). This requires introduction of a new kernel interface to get notifications for cgroups running empty, for example via fanotify() on cgroupfs. -18. Make use of EXT4_IOC_MOVE_EXT in systemd's readahead implementation. This allows reordering/defragmentation of the files needed for boot. According to the data from [http://e4rat.sourceforge.net/](http://e4rat.sourceforge.net/) this might shorten the boot time to 40%. Implementation is not trivial, but given that we already support btrfs defragmentation and example code for this exists (e4rat as linked) should be fairly straightforward. -19. Compress readahead pack files with XZ or so. 
Since boot these days tends to be clearly IO bound (and not CPU bound) it might make sense to reduce the IO load for the pack file by compressing it. Since we already have a dependency on XZ we'd recommend using XZ for this. -20. Update the readahead logic to also precache directories (in addition to files). -21. Improve a couple of algorithms in the unit dependency graph calculation logic, as well as unit file loading. For example, right now when loading units we match them up with a subset of the other loaded units in order to add automatic dependencies between them where appropriate. Usually the set of units matched up is small, but the complexity is currently O(n^2), and this could be optimized. Since unit file loading and calculations in the dependency graphs is the only major, synchronous, computation-intensive bit of PID 1, and is executed before any services are started this should bring relevant improvements, especially on systems with big dependency graphs. -22. Add socket activation to X. Due to the special socket allocation semantics of X this is useful only for display :0. This should allow parallelization of X startup with its clients. -23. The usual housekeeping: get rid of shell-based services (i.e. SysV init scripts), replace them with unit files. Don't make use of Type=forking and ordering dependencies if possible, use socket activation with Type=simple instead. This allows drastically better parallelized start-up for your services. Also, if you cannot use socket activation, at least consider patching your services to support Type=notify in place of Type=forking. Consider making seldom used services activated on-demand (for example, printer services), and start frequently used services already at boot instead of delaying them until they are used. -24. Consider making use of systemd for the session as well, the way Tizen is doing this. 
This still needs some love in systemd upstream to be a smooth ride, but we definitely would like to go this way sooner or later, even for the normal desktops. -25. Add an option for service units to temporarily bump the CPU and IO priority of the startup code of important services. Note however, that we assume that this will not bring much and hence recommend looking into this only very late. Since boot-up tends to be IO bound, solutions such as readahead are probably more interesting than prioritizing service startup IO. Also, this would probably always require a certain amount of manual configuration since determining automatically which services are important is hard (if not impossible), because we cannot track properly which services other services wait for. -26. Same as the previous item, but temporarily lower the CPU/IO priority of the startups part of unimportant leaf services. This is probably more useful than 11 as it is easier to determine which processes don't matter. -27. Add a kernel sockopt for AF_UNIX to increase the maximum datagram queue length for SOCK_DGRAM sockets. This would allow us to queue substantially more logging datagrams in the syslog and journal sockets, and thus move the point where syslog/journal clients have to block before their message writes finish much later in the boot process. The current kernel default is rather low with 10. (As a temporary hack it is possible to increase /proc/sys/net/unix/max_dgram_qlen globally, but this has implications beyond systemd, and should probably be avoided.) The kernel patch to make this work is most likely trivial. In general, this should allow us to improve the level of parallelization between clients and servers for AF_UNIX sockets of type SOCK_DGRAM or SOCK_SEQPACKET. Again: the list above contains things we'd like to see in systemd anyway. We didn't do much profiling for these features, but we have enough indication to assume that these bits will bring some improvements. 
But yeah, if you work on this, keep your profiling tools ready at all times. + +14. Get rid of a local MTA, if you are building a desktop or appliance. +I.e. on Fedora remove the sendmail RPMs which are (still!) installed by default. + +15. If you build an appliance, don't forget that various components of systemd are optional and may be disabled during build time, see "./configure --help" for details. +For example, get rid of the virtual console setup if you never have local console users (this is a major source of slowness, actually). +In addition, if you never have local users at all, consider disabling logind. And there are more components that are frequently unnecessary on appliances. + +16. This goes without saying: the boot-up gets faster if you started less stuff at boot. +So run "systemctl" and check if there's stuff you don't need and disable it, or even remove its package. + +17. Don't use debug kernels. Debug kernels are slow. +Fedora exclusively uses debug kernels during the development phase of each release. +If you care about boot performance, either recompile these kernels with debugging turned off or wait for the final distribution release. +It's a drastic difference. That also means that if you publish boot performance data of a Fedora pre-release distribution you are doing something wrong. ;-) So much about the basics of how to get a quick boot. +Now, here's an incomprehensive list of things we'd like to see improved in systemd (and elsewhere) over short or long and need a bit of hacking (sometimes more, and sometimes less): + +18. Get rid of systemd-cgroups-agent. +Currently, whenever a systemd cgroup runs empty a tool "systemd-cgroups-agent" is invoked by the kernel which then notifies systemd about it. +The need for this tool should really go away, which will save a number of forked processes at boot, and should make things faster (especially shutdown). 
+This requires introduction of a new kernel interface to get notifications for cgroups running empty, for example via fanotify() on cgroupfs. + +19. Make use of EXT4_IOC_MOVE_EXT in systemd's readahead implementation. +This allows reordering/defragmentation of the files needed for boot. +According to the data from [http://e4rat.sourceforge.net/](http://e4rat.sourceforge.net/) this might shorten the boot time to 40%. +Implementation is not trivial, but given that we already support btrfs defragmentation and example code for this exists (e4rat as linked) should be fairly straightforward. + +20. Compress readahead pack files with XZ or so. Since boot these days tends to be clearly IO bound (and not CPU bound) it might make sense to reduce the IO load for the pack file by compressing it. Since we already have a dependency on XZ we'd recommend using XZ for this. + +21. Update the readahead logic to also precache directories (in addition to files). + +22. Improve a couple of algorithms in the unit dependency graph calculation logic, as well as unit file loading. +For example, right now when loading units we match them up with a subset of the other loaded units in order to add automatic dependencies between them where appropriate. +Usually the set of units matched up is small, but the complexity is currently O(n^2), and this could be optimized. Since unit file loading and calculations in the dependency graphs is the only major, synchronous, computation-intensive bit of PID 1, and is executed before any services are started this should bring relevant improvements, especially on systems with big dependency graphs. + +23. Add socket activation to X. Due to the special socket allocation semantics of X this is useful only for display :0. This should allow parallelization of X startup with its clients. + +24. The usual housekeeping: get rid of shell-based services (i.e. SysV init scripts), replace them with unit files.
+Don't make use of Type=forking and ordering dependencies if possible, use socket activation with Type=simple instead. +This allows drastically better parallelized start-up for your services. Also, if you cannot use socket activation, at least consider patching your services to support Type=notify in place of Type=forking. Consider making seldom used services activated on-demand (for example, printer services), and start frequently used services already at boot instead of delaying them until they are used. + +25. Consider making use of systemd for the session as well, the way Tizen is doing this. +This still needs some love in systemd upstream to be a smooth ride, but we definitely would like to go this way sooner or later, even for the normal desktops. + +26. Add an option for service units to temporarily bump the CPU and IO priority of the startup code of important services. +Note however, that we assume that this will not bring much and hence recommend looking into this only very late. +Since boot-up tends to be IO bound, solutions such as readahead are probably more interesting than prioritizing service startup IO. Also, this would probably always require a certain amount of manual configuration since determining automatically which services are important is hard (if not impossible), because we cannot track properly which services other services wait for. + +27. Same as the previous item, but temporarily lower the CPU/IO priority of the startup part of unimportant leaf services. +This is probably more useful than the previous item as it is easier to determine which processes don't matter. + +28. Add a kernel sockopt for AF_UNIX to increase the maximum datagram queue length for SOCK_DGRAM sockets. +This would allow us to queue substantially more logging datagrams in the syslog and journal sockets, and thus move the point where syslog/journal clients have to block before their message writes finish much later in the boot process.
+The current kernel default is rather low with 10. (As a temporary hack it is possible to increase /proc/sys/net/unix/max_dgram_qlen globally, but this has implications beyond systemd, and should probably be avoided.) The kernel patch to make this work is most likely trivial. +In general, this should allow us to improve the level of parallelization between clients and servers for AF_UNIX sockets of type SOCK_DGRAM or SOCK_SEQPACKET. Again: the list above contains things we'd like to see in systemd anyway. +We didn't do much profiling for these features, but we have enough indication to assume that these bits will bring some improvements. +But yeah, if you work on this, keep your profiling tools ready at all times. diff --git a/docs/PASSWORD_AGENTS.md b/docs/PASSWORD_AGENTS.md index 29bd949..297d8ea 100644 --- a/docs/PASSWORD_AGENTS.md +++ b/docs/PASSWORD_AGENTS.md @@ -7,11 +7,18 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Password Agents -systemd 12 and newer support lightweight password agents which can be used to query the user for system-level passwords or passphrases. These are passphrases that are not related to a specific user, but to some kind of hardware or service. Right now this is used exclusively for encrypted hard-disk passphrases but later on this is likely to be used to query passphrases of SSL certificates at Apache startup time as well. The basic idea is that a system component requesting a password entry can simply drop a simple .ini-style file into `/run/systemd/ask-password` which multiple different agents may watch via `inotify()`, and query the user as necessary. The answer is then sent back to the querier via an `AF_UNIX`/`SOCK_DGRAM` socket. Multiple agents might be running at the same time in which case they all should query the user and the agent which answers first wins. 
Right now systemd ships with the following passphrase agents: +systemd 12 and newer support lightweight password agents which can be used to query the user for system-level passwords or passphrases. +These are passphrases that are not related to a specific user, but to some kind of hardware or service. +Right now this is used exclusively for encrypted hard-disk passphrases but later on this is likely to be used to query passphrases of SSL certificates at Apache startup time as well. +The basic idea is that a system component requesting a password entry can simply drop a simple .ini-style file into `/run/systemd/ask-password` which multiple different agents may watch via `inotify()`, and query the user as necessary. +The answer is then sent back to the querier via an `AF_UNIX`/`SOCK_DGRAM` socket. +Multiple agents might be running at the same time in which case they all should query the user and the agent which answers first wins. +Right now systemd ships with the following passphrase agents: * A Plymouth agent used for querying passwords during boot-up * A console agent used in similar situations if Plymouth is not available -* A GNOME agent which can be run as part of the normal user session which pops up a notification message and icon which when clicked receives the passphrase from the user. This is useful and necessary in case an encrypted system hard-disk is plugged in when the machine is already up. +* A GNOME agent which can be run as part of the normal user session which pops up a notification message and icon which when clicked receives the passphrase from the user. + This is useful and necessary in case an encrypted system hard-disk is plugged in when the machine is already up. * A [`wall(1)`](https://man7.org/linux/man-pages/man1/wall.1.html) agent which sends wall messages as soon as a password shall be entered. 
* A simple tty agent which is built into "`systemctl start`" (and similar commands) and asks passwords to the user during manual startup of a service * A simple tty agent which can be run manually to respond to all queued passwords @@ -22,20 +29,36 @@ It is easy to write additional agents. The basic algorithm to follow looks like * Ignore all events on files in that directory that do not start with "`ask.`" * As soon as a file named "`ask.xxxx`" shows up, read it. It's a simple `.ini` file that may be parsed with the usual parsers. The `xxxx` suffix is randomized. * Make sure to ignore unknown `.ini` file keys in those files, so that we can easily extend the format later on. -* You'll find the question to ask the user in the `Message=` field in the `[Ask]` section. It is a single-line string in UTF-8, which might be internationalized (by the party that originally asks the question, not by the agent). +* You'll find the question to ask the user in the `Message=` field in the `[Ask]` section. + It is a single-line string in UTF-8, which might be internationalized (by the party that originally asks the question, not by the agent). * You'll find an icon name (following the XDG icon naming spec) to show next to the message in the `Icon=` field in the `[Ask]` section -* You'll find the PID of the client asking the question in the `PID=` field in the `[Ask]` section (Before asking your question use `kill(PID, 0)` and ignore the file if this returns `ESRCH`; there's no need to show the data of this field but if you want to you may) +* You'll find the PID of the client asking the question in the `PID=` field in the `[Ask]` section + (Before asking your question use `kill(PID, 0)` and ignore the file if this returns `ESRCH`; + there's no need to show the data of this field but if you want to you may) * `Echo=` specifies whether the input should be obscured. If this field is missing or is `Echo=0`, the input should not be shown. 
* The socket to send the response to is configured via `Socket=` in the `[Ask]` section. It is a `AF_UNIX`/`SOCK_DGRAM` socket in the file system. -* Ignore files where the time specified in the `NotAfter=` field in the `[Ask]` section is in the past. The time is specified in usecs, and refers to the `CLOCK_MONOTONIC` clock. If `NotAfter=` is `0`, no such check should take place. +* Ignore files where the time specified in the `NotAfter=` field in the `[Ask]` section is in the past. + The time is specified in usecs, and refers to the `CLOCK_MONOTONIC` clock. If `NotAfter=` is `0`, no such check should take place. * Make sure to hide a password query dialog as soon as a) the `ask.xxxx` file is deleted, watch this with inotify. b) the `NotAfter=` time elapses, if it is set `!= 0`. -* Access to the socket is restricted to privileged users. To acquire the necessary privileges to send the answer back, consider using PolicyKit. In fact, the GNOME agent we ship does that, and you may simply piggyback on that, by executing "`/usr/bin/pkexec /lib/systemd/systemd-reply-password 1 /path/to/socket`" or "`/usr/bin/pkexec /lib/systemd/systemd-reply-password 0 /path/to/socket`" and writing the password to its standard input. Use '`1`' as argument if a password was entered by the user, or '`0`' if the user canceled the request. -* If you do not want to use PK ensure to acquire the necessary privileges in some other way and send a single datagram to the socket consisting of the password string either prefixed with "`+`" or with "`-`" depending on whether the password entry was successful or not. You may but don't have to include a final `NUL` byte in your message. +* Access to the socket is restricted to privileged users. + To acquire the necessary privileges to send the answer back, consider using PolicyKit. 
+ In fact, the GNOME agent we ship does that, and you may simply piggyback on that, by executing "`/usr/bin/pkexec /lib/systemd/systemd-reply-password 1 /path/to/socket`" or "`/usr/bin/pkexec /lib/systemd/systemd-reply-password 0 /path/to/socket`" and writing the password to its standard input. + Use '`1`' as argument if a password was entered by the user, or '`0`' if the user canceled the request. +* If you do not want to use PK ensure to acquire the necessary privileges in some other way and send a single datagram + to the socket consisting of the password string either prefixed with "`+`" or with "`-`" depending on whether the password entry was successful or not. + You may but don't have to include a final `NUL` byte in your message. Again, it is essential that you stop showing the password box/notification/status icon if the `ask.xxx` file is removed or when `NotAfter=` elapses (if it is set `!= 0`)! -It may happen that multiple password entries are pending at the same time. Your agent needs to be able to deal with that. Depending on your environment you may either choose to show all outstanding passwords at the same time or instead only one and as soon as the user has replied to that one go on to the next one. +It may happen that multiple password entries are pending at the same time. +Your agent needs to be able to deal with that. Depending on your environment you may either choose to show all outstanding passwords at the same time or instead only one and as soon as the user has replied to that one go on to the next one. -You may test this all with manually invoking the "`systemd-ask-password`" tool on the command line. Pass `--no-tty` to ensure the password is asked via the agent system. Note that only privileged users may use this tool (after all this is intended purely for system-level passwords). +You may test this all with manually invoking the "`systemd-ask-password`" tool on the command line. 
+Pass `--no-tty` to ensure the password is asked via the agent system. +Note that only privileged users may use this tool (after all this is intended purely for system-level passwords). -If you write a system level agent a smart way to activate it is using systemd `.path` units. This will ensure that systemd will watch the `/run/systemd/ask-password` directory and spawn the agent as soon as that directory becomes non-empty. In fact, the console, wall and Plymouth agents are started like this. If systemd is used to maintain user sessions as well you can use a similar scheme to automatically spawn your user password agent as well. (As of this moment we have not switched any DE over to use systemd for session management, however.) +If you write a system level agent a smart way to activate it is using systemd `.path` units. +This will ensure that systemd will watch the `/run/systemd/ask-password` directory and spawn the agent as soon as that directory becomes non-empty. +In fact, the console, wall and Plymouth agents are started like this. +If systemd is used to maintain user sessions as well you can use a similar scheme to automatically spawn your user password agent as well. +(As of this moment we have not switched any DE over to use systemd for session management, however.) diff --git a/docs/PAX_CONTROL_GROUPS.md b/docs/PAX_CONTROL_GROUPS.md new file mode 100644 index 0000000..4b2374a --- /dev/null +++ b/docs/PAX_CONTROL_GROUPS.md @@ -0,0 +1,117 @@ +--- +title: Pax Controla Groupiana +category: Users, Groups and Home Directories +layout: default +SPDX-License-Identifier: LGPL-2.1-or-later +--- + +# Pax Controla Groupiana + +_aka "How to behave nicely in the cgroupfs trees"_ + +**Important Update: Please consult this document only as a historical reference. +It was written under the assumption that the cgroups tree was a shared resource. +However, after much discussion this concept has been deemed outdated. 
+The cgroups tree can no longer be considered a shared resource. +Instead, a management daemon of some kind needs to arbitrate access to it, and it needs to actively propagate changes between the entities it manages. +More specifically, on systemd systems this management daemon is systemd itself, accessible via a number of bus APIs. +This means instead of dealing directly with the low-level interfaces of the cgroup file system, please use systemd's high-level APIs as a replacement, see the +[New Control Group Interfaces](/CONTROL_GROUP_INTERFACE) +for details. They offer similar functionality.** + +Are you writing an application interfacing with the cgroups tree? +The cgroups trees are a shared resource, other applications will use them too. +Here are a few recommendations how to write your application in a way that minimizes conflicts with other applications. +If you follow these guidelines applications should not step on any other application's toes and users will be happy. + +Before you read these recommendations please make sure you understand cgroups thoroughly, +and specifically are aware what a controller is, what a named hierarchy is and so on. + +## Intended Audience + +You should consider these recommendations if you are working on one of the following: + +- You write a system or session manager based on cgroups (like systemd) +- You write a VM manager based on cgroups (like libvirt) +- You write a terminal application and want to place every shell in a separate cgroup (like gnome-terminal) +- You write a web browser and want to place every renderer in a separate cgroup (like Firefox or Chrome) +- You create a container for some purpose (such as systemd-nspawn) +- Or you use cgroups for any other purpose and want things to work nicely with other applications. + +## General Recommendations + +- If you use one of the kernel controllers, do _not_ assume you are the only one who uses them.
+ Other programs may manipulate the tree, add cgroups and change group attributes at any time, and they will not inform you about it. + The kernel provided controller hierarchies are a shared resource, so be nice. +- If you use a generic named hierarchy with no controller attached, then you may assume it's yours and only yours, and that no other programs interfere with it. +- If you use a generic named hierarchy with no controller attached, then make sure to name it after your project in order to minimize namespacing conflicts. + A hierarchy named "name=web" is a bit generic. + A hierarchy named "name=apache" is a much better choice, if you are an Apache developer and need an entire hierarchy all for yourself. +- Do _not_ assume everybody uses the same library to manipulate the cgroups tree as you are. + In fact most likely most applications and the user himself will manipulate the tree without any further indirection (i.e. will use naked system calls/shell commands) +- Never create cgroups at the top of the tree (i.e. with an absolute path). + If possible find the cgroup your own process was started in and create subgroups only below that group (read /proc/self/cgroup to find it). + If that's not applicable, then at least place yourself below the cgroup path of PID 1 (read /proc/1/cgroup to find it). + This is important to ensure that containers work properly (the cgroupfs tree is currently not virtualized for containers!), and solves permission problems, and makes the whole system nicely stackable. +- A corollary of this: If you spawn subprocesses expect that they will create subcgroups. + That means when terminating there might be subcgroups below the ones you created and you hence need to recursively remove them too. + In fact, many of your operations must probably be executed in a recursive fashion. +- Do not play permission games: if you are an unprivileged user application then it's _not_ your business to ensure you have the right permissions + (i.e.
do not include any setuid code in your app to create groups). + Instead your system manager (such as systemd), + should provide you with the right set of permissions on the cgroup you are running in to create subgroups. + Normally that should mean that depending on administrator configuration, you will or will not get access to create subgroups under the cgroup you are running in and the ability to add PIDs to it. + If you don't get access to these hierarchies then this might be a decision by the administrator and you should do your best to go on, and fail gracefully. +- If you create a cgroup, then you are in charge of removing it too after using it. + Do not remove other program's cgroups. + Special exception: in some cases it is OK to pre-set attributes on certain cgroups that are primarily managed by another program. + (Example: in systemd we are fine if you externally pre-create or manipulate service cgroups, for example to make changes to some attributes you cannot control with systemd natively, see below). + In that case: create the cgroup and set the sticky bit (+t) on the tasks file in it. + This will then be used as an indication to the primary manager of the group not to remove the cgroup at the end, in order to avoid that your settings are lost. + This is of course a bit of a misuse of the sticky bit, but given that it serves no other purpose on Linux for normal files, it is an OK use, with a fitting meaning given the name of "sticky bit". +- If you find a process in a cgroup you are about to remove, and it is not yours, consider leaving the cgroup around. + I.e. if rmdir returns EEMPTY, ignore this. +- The cgroup mount point for a specific hierarchy is /sys/fs/cgroup/$CONTROLLER/. + (Example: /sys/fs/cgroup/cpu for the "cpu" controller). 
+ In your application you are welcome to rely on these standardized mount points, + and it is not necessary to dynamically determine the current mount point via /proc/self/mountinfo (but if you do, that's of course fine, too). + Note that /sys/fs/cgroup/$CONTROLLER/ might actually just be a symlink to some other mount point (see below). +- If multiple controllers are mounted into the same hierarchy, it is guaranteed that symlinks exist to make sure all jointly mounted controllers are still available under /sys/fs/cgroup/$CONTROLLER/. + Example: if "cpu" and "cpuacct" are mounted together, then symlinks /sys/fs/cgroup/cpu and /sys/fs/cgroup/cpuacct will point to the joint mountpoint (which could be something like /sys/fs/cgroup/cpu+cpuacct). +- Your application should not mount the cgroup controller file systems (unless it is your own private named hierarchy). + This is exclusively a job for the system manager or a system-wide init script such as cgconfig. + If you work on a system manager or such an init script you must mount the cgroup controllers to /sys/fs/cgroup/$CONTROLLER/ or provide compatibility symlinks. +- It's a good idea not to fail if a cgroup already exists when you try to create it. + Ignore EEXIST on mkdir. +- Avoid renaming cgroups or similar fancier file operations. +- Expect that other programs might readjust the attributes on your cgroups dynamically during runtime. +- When creating a cgroup pick a nice and descriptive name that is guessable and no surprise to the admin. + The admin will thank you for this if he has to read the output of "ps -eo pid,args,cgroups" +- /sys/fs/cgroup is a tmpfs. If you create your own private named hierarchy then you are welcome to mount it into a subdirectory of this directory. + This minimizes surprises for the user. +- /sys/fs/cgroup is a tmpfs, but its only intended use is to act as a place where control group hierarchies can be mounted or symlinked to.
+ You should not place any other kind of file in this directory. + The same way as /dev/shm is for POSIX shared memory segments only -- and nothing else -- this directory is for cgroup hierarchies only. + Just because something is a tmpfs it doesn't mean you can actually use it for "temporary" files, thank you. +- Avoid creating orthogonal hierarchies in the various kernel controller hierarchies. + Please make sure that the controllers contain the same hierarchy or subsets of each other. + +## Cooperation with systemd + +systemd adheres to the recommendations above and guarantees additional behavior which might be useful for writing applications that cooperate with systemd on cgroup management: + +- If a service cgroup already exists, systemd will make use of it and not recreate it. + (If +t is set on the tasks file it will not remove it when stopping a service, otherwise it will, see above). + It is hence OK to pre-create cgroups and then let systemd use it, without having systemd remove it afterwards. +- If a service cgroup already exists, systemd will not override the attributes of the cgroup with the exception of those explicitly configured in the systemd unit files. + It is hence OK to pre-create cgroups for use in systemd, and pre-apply attributes to it. +- To avoid that systemd places all services in automatic cgroups in the "cpu" hierarchy change the [?](https://secure.freedesktop.org/write/www/ikiwiki.cgi?do=create&from=Software%2Fsystemd%2FPaxControlGroups&page=DefaultControllers) DefaultControllers= in /etc/systemd/system.conf and set it to the empty string. +- By default systemd will place services only in automatic cgroups in the "cpu" hierarchy and in its own private tree "name=systemd". 
+ If you want it to duplicate these trees in other hierarchies add them to [?](https://secure.freedesktop.org/write/www/ikiwiki.cgi?do=create&from=Software%2Fsystemd%2FPaxControlGroups&page=DefaultControllers) DefaultControllers= in /etc/systemd/system.conf. +- To opt-out or opt-in specific services from the automatic tree generation in the kernel controller hierarchies use [?](https://secure.freedesktop.org/write/www/ikiwiki.cgi?do=create&from=Software%2Fsystemd%2FPaxControlGroups&page=ControlGroup) ControlGroup= in the unit file. + Use "[?](https://secure.freedesktop.org/write/www/ikiwiki.cgi?do=create&from=Software%2Fsystemd%2FPaxControlGroups&page=ControlGroup) ControlGroup=cpu:/" to opt-out of cgroup assignment for a service or + "[?](https://secure.freedesktop.org/write/www/ikiwiki.cgi?do=create&from=Software%2Fsystemd%2FPaxControlGroups&page=ControlGroup) ControlGroup=cpu:/foo/bar" to manipulate the cgroup path. +- Stay away from the name=systemd named hierarchy. + It's private property of systemd. + You are welcome to explore it, but it is uncool to modify it from outside systemd. +Thanks. diff --git a/docs/PORTABILITY_AND_STABILITY.md b/docs/PORTABILITY_AND_STABILITY.md index abdc3dc..e611c3b 100644 --- a/docs/PORTABILITY_AND_STABILITY.md +++ b/docs/PORTABILITY_AND_STABILITY.md @@ -7,23 +7,37 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Interface Portability and Stability Promise -systemd provides various interfaces developers and programs might rely on. Starting with version 26 (the first version released with Fedora 15) we promise to keep a number of them stable and compatible for the future. +systemd provides various interfaces developers and programs might rely on. +Starting with version 26 (the first version released with Fedora 15) we promise to keep a number of them stable and compatible for the future. The stable interfaces are: -* **The unit configuration file format**. Unit files written now will stay compatible with future versions of systemd. 
Extensions to the file format will happen in a way that existing files remain compatible. +* **The unit configuration file format**. Unit files written now will stay compatible with future versions of systemd. + Extensions to the file format will happen in a way that existing files remain compatible. -* **The command line interface** of `systemd`, `systemctl`, `loginctl`, `journalctl`, and all other command line utilities installed in `$PATH` and documented in a man page. We will make sure that scripts invoking these commands will continue to work with future versions of systemd. Note however that the output generated by these commands is generally not included in the promise, unless it is documented in the man page. Example: the output of `systemctl status` is not stable, but that of `systemctl show` is, because the former is intended to be human readable and the latter computer readable, and this is documented in the man page. +* **The command line interface** of `systemd`, `systemctl`, `loginctl`, `journalctl`, and all other command line utilities installed in `$PATH` and documented in a man page. + We will make sure that scripts invoking these commands will continue to work with future versions of systemd. + Note however that the output generated by these commands is generally not included in the promise, unless it is documented in the man page. + Example: the output of `systemctl status` is not stable, but that of `systemctl show` is, because the former is intended to be human readable and the latter computer readable, and this is documented in the man page. -* **The protocol spoken on the socket referred to by `$NOTIFY_SOCKET`**, as documented in [sd_notify(3)](https://www.freedesktop.org/software/systemd/man/sd_notify.html). +* **The protocol spoken on the socket referred to by `$NOTIFY_SOCKET`**, as documented in + [sd_notify(3)](https://www.freedesktop.org/software/systemd/man/sd_notify.html). 
Note that, although using + libsystemd is a good choice, this protocol can also be reimplemented without external dependencies, as + demonstrated in the example listed in + [sd_notify(3)](https://www.freedesktop.org/software/systemd/man/devel/sd_notify.html#Notes) -* Some of the **"special" unit names** and their semantics. To be precise the ones that are necessary for normal services, and not those required only for early boot and late shutdown, with very few exceptions. To list them here: `basic.target`, `shutdown.target`, `sockets.target`, `network.target`, `getty.target`, `graphical.target`, `multi-user.target`, `rescue.target`, `emergency.target`, `poweroff.target`, `reboot.target`, `halt.target`, `runlevel[1-5].target`. +* Some of the **"special" unit names** and their semantics. + To be precise the ones that are necessary for normal services, and not those required only for early boot and late shutdown, with very few exceptions. + To list them here: `basic.target`, `shutdown.target`, `sockets.target`, `network.target`, `getty.target`, `graphical.target`, `multi-user.target`, `rescue.target`, `emergency.target`, `poweroff.target`, `reboot.target`, `halt.target`, `runlevel[1-5].target`. -* **The D-Bus interfaces of the main service daemon and other daemons**. We try to always preserve backwards compatibility, and intentional breakage is never introduced. Nevertheless, when we find bugs that mean that the existing interface was not useful, or when the implementation did something different than stated by the documentation and the implemented behaviour is not useful, we will fix the implementation and thus introduce a change in behaviour. But the API (parameter counts and types) is never changed, and existing attributes and methods will not be removed. +* **The D-Bus interfaces of the main service daemon and other daemons**. We try to always preserve backwards compatibility, and intentional breakage is never introduced. 
+ Nevertheless, when we find bugs that mean that the existing interface was not useful, or when the implementation did something different than stated by the documentation and the implemented behaviour is not useful, we will fix the implementation and thus introduce a change in behaviour. + But the API (parameter counts and types) is never changed, and existing attributes and methods will not be removed. * For a more comprehensive and authoritative list, consult the chart below. -The following interfaces will not necessarily be kept stable for now, but we will eventually make a stability promise for these interfaces too. In the meantime we will however try to keep breakage of these interfaces at a minimum: +The following interfaces will not necessarily be kept stable for now, but we will eventually make a stability promise for these interfaces too. +In the meantime we will however try to keep breakage of these interfaces at a minimum: * **The set of states of the various state machines used in systemd**, e.g. the high-level unit states inactive, active, deactivating, and so on, as well (and in particular) the low-level per-unit states. @@ -35,42 +49,64 @@ The following interfaces are considered private to systemd, and are not and will * **The internal protocols** used on the various sockets such as the sockets `/run/systemd/shutdown`, `/run/systemd/private`. -One of the main goals of systemd is to unify basic Linux configurations and service behaviors across all distributions. Systemd project does not contain any distribution-specific parts. Distributions are expected to convert over time their individual configurations to the systemd format, or they will need to carry and maintain patches in their package if they still decide to stay different. +One of the main goals of systemd is to unify basic Linux configurations and service behaviors across all distributions. +Systemd project does not contain any distribution-specific parts. 
+Distributions are expected to convert over time their individual configurations to the systemd format, or they will need to carry and maintain patches in their package if they still decide to stay different. -What does this mean for you? When developing with systemd, don't use any of the latter interfaces, or we will tell your mom, and she won't love you anymore. You are welcome to use the other interfaces listed here, but if you use any of the second kind (i.e. those where we don't yet make a stability promise), then make sure to subscribe to our mailing list, where we will announce API changes, and be prepared to update your program eventually. +What does this mean for you? When developing with systemd, don't use any of the latter interfaces, or we will tell your mom, and she won't love you anymore. +You are welcome to use the other interfaces listed here, but if you use any of the second kind (i.e. those where we don't yet make a stability promise), then make sure to subscribe to our mailing list, where we will announce API changes, and be prepared to update your program eventually. -Note that this is a promise, not an eternal guarantee. These are our intentions, but if in the future there are very good reasons to change or get rid of an interface we have listed above as stable, then we might take the liberty to do so, despite this promise. However, if we do this, then we'll do our best to provide a smooth and reasonably long transition phase. +Note that this is a promise, not an eternal guarantee. +These are our intentions, but if in the future there are very good reasons to change or get rid of an interface we have listed above as stable, then we might take the liberty to do so, despite this promise. +However, if we do this, then we'll do our best to provide a smooth and reasonably long transition phase. ## Interface Portability And Stability Chart -systemd provides a number of APIs to applications. 
Below you'll find a table detailing which APIs are considered stable and how portable they are. +systemd provides a number of APIs to applications. +Below you'll find a table detailing which APIs are considered stable and how portable they are. This list is intended to be useful for distribution and OS developers who are interested in maintaining a certain level of compatibility with the new interfaces systemd introduced, without relying on systemd itself. -In general it is our intention to cooperate through interfaces and not code with other distributions and OSes. That means that the interfaces where this applies are best reimplemented in a compatible fashion on those other operating systems. To make this easy we provide detailed interface documentation where necessary. That said, it's all Open Source, hence you have the option to a) fork our code and maintain portable versions of the parts you are interested in independently for your OS, or b) build systemd for your distro, but leave out all components except the ones you are interested in and run them without the core of systemd involved. We will try not to make this any more difficult than necessary. Patches to allow systemd code to be more portable will be accepted on case-by-case basis (essentially, patches to follow well-established standards instead of e.g. glibc or linux extensions have a very high chance of being accepted, while patches which make the code ugly or exist solely to work around bugs in other projects have a low chance of being accepted). +In general it is our intention to cooperate through interfaces and not code with other distributions and OSes. +That means that the interfaces where this applies are best reimplemented in a compatible fashion on those other operating systems. +To make this easy we provide detailed interface documentation where necessary. 
+That said, it's all Open Source, hence you have the option to a) fork our code and maintain portable versions of the parts you are interested in independently for your OS, or b) build systemd for your distro, but leave out all components except the ones you are interested in and run them without the core of systemd involved. +We will try not to make this any more difficult than necessary. +Patches to allow systemd code to be more portable will be accepted on case-by-case basis (essentially, patches to follow well-established standards instead of e.g. glibc or linux extensions have a very high chance of being accepted, while patches which make the code ugly or exist solely to work around bugs in other projects have a low chance of being accepted). -Many of these interfaces are already being used by applications and 3rd party code. If you are interested in compatibility with these applications, please consider supporting these interfaces in your distribution, where possible. +Many of these interfaces are already being used by applications and 3rd party code. +If you are interested in compatibility with these applications, please consider supporting these interfaces in your distribution, where possible. ## General Portability of systemd and its Components -**Portability to OSes:** systemd is not portable to non-Linux systems. It makes use of a large number of Linux-specific interfaces, including many that are used by its very core. We do not consider it feasible to port systemd to other Unixes (let alone non-Unix operating systems) and will not accept patches for systemd core implementing any such portability (but hey, it's git, so it's as easy as it can get to maintain your own fork...). APIs that are supposed to be used as library code are exempted from this: it is important to us that these compile nicely on non-Linux and even non-Unix platforms, even if they might just become NOPs. +**Portability to OSes:** systemd is not portable to non-Linux systems. 
+It makes use of a large number of Linux-specific interfaces, including many that are used by its very core. +We do not consider it feasible to port systemd to other Unixes (let alone non-Unix operating systems) and will not accept patches for systemd core implementing any such portability (but hey, it's git, so it's as easy as it can get to maintain your own fork...). +APIs that are supposed to be used as library code are exempted from this: it is important to us that these compile nicely on non-Linux and even non-Unix platforms, even if they might just become NOPs. -**Portability to Architectures:** It is important to us that systemd is portable to little endian as well as big endian systems. We will make sure to provide portability with all important architectures and hardware Linux runs on and are happy to accept patches for this. +**Portability to Architectures:** It is important to us that systemd is portable to little endian as well as big endian systems. +We will make sure to provide portability with all important architectures and hardware Linux runs on and are happy to accept patches for this. -**Portability to Distributions:** It is important to us that systemd is portable to all Linux distributions. However, the goal is to unify many of the needless differences between the distributions, and hence will not accept patches for certain distribution-specific work-arounds. Compatibility with the distribution's legacy should be maintained in the distribution's packaging, and not in the systemd source tree. +**Portability to Distributions:** It is important to us that systemd is portable to all Linux distributions. +However, the goal is to unify many of the needless differences between the distributions, and hence will not accept patches for certain distribution-specific work-arounds. +Compatibility with the distribution's legacy should be maintained in the distribution's packaging, and not in the systemd source tree. 
-**Compatibility with Specific Versions of Other packages:** We generally avoid adding compatibility kludges to systemd that work around bugs in certain versions of other software systemd interfaces with. We strongly encourage fixing bugs where they are, and if that's not systemd we rather not try to fix it there. (There are very few exceptions to this rule possible, and you need an exceptionally strong case for it). +**Compatibility with Specific Versions of Other packages:** We generally avoid adding compatibility kludges to systemd that work around bugs in certain versions of other software systemd interfaces with. We strongly encourage fixing bugs where they are, and if that's not systemd we would rather not try to fix it there. +(There are very few exceptions to this rule possible, and you need an exceptionally strong case for it). ## General Portability of systemd's APIs -systemd's APIs are available everywhere where systemd is available. Some of the APIs we have defined are supposed to be generic enough to be implementable independently of systemd, thus allowing compatibility with systems systemd itself is not compatible with, i.e. other OSes, and distributions that are unwilling to fully adopt systemd. +systemd's APIs are available everywhere where systemd is available. +Some of the APIs we have defined are supposed to be generic enough to be implementable independently of systemd, thus allowing compatibility with systems systemd itself is not compatible with, i.e. other OSes, and distributions that are unwilling to fully adopt systemd. -A number of systemd's APIs expose Linux or systemd-specific features that cannot sensibly be implemented elsewhere. Please consult the table below for information about which ones these are. +A number of systemd's APIs expose Linux or systemd-specific features that cannot sensibly be implemented elsewhere. +Please consult the table below for information about which ones these are. 
-Note that not all of these interfaces are our invention (but most), we just adopted them in systemd to make them more prominently implemented. For example, we adopted many Debian facilities in systemd to push it into the other distributions as well. +Note that not all of these interfaces are our invention (but most), we just adopted them in systemd to make them more prominently implemented. +For example, we adopted many Debian facilities in systemd to push it into the other distributions as well. --- @@ -83,9 +119,9 @@ And now, here's the list of (hopefully) all APIs that we have introduced with sy | [hostnamed](https://www.freedesktop.org/software/systemd/man/org.freedesktop.hostname1.html) | D-Bus | yes | yes | GNOME | yes | [Ubuntu](https://launchpad.net/ubuntu/+source/ubuntu-system-service), [Gentoo](http://www.gentoo.org/proj/en/desktop/gnome/openrc-settingsd.xml), [BSD](http://uglyman.kremlin.cc/gitweb/gitweb.cgi?p=systembsd.git;a=summary) | partially | | [localed](https://www.freedesktop.org/software/systemd/man/org.freedesktop.locale1.html) | D-Bus | yes | yes | GNOME | yes | [Ubuntu](https://launchpad.net/ubuntu/+source/ubuntu-system-service), [Gentoo](http://www.gentoo.org/proj/en/desktop/gnome/openrc-settingsd.xml), [BSD](http://uglyman.kremlin.cc/gitweb/gitweb.cgi?p=systembsd.git;a=summary) | partially | | [timedated](https://www.freedesktop.org/software/systemd/man/org.freedesktop.timedate1.html) | D-Bus | yes | yes | GNOME | yes | [Gentoo](http://www.gentoo.org/proj/en/desktop/gnome/openrc-settingsd.xml), [BSD](http://uglyman.kremlin.cc/gitweb/gitweb.cgi?p=systembsd.git;a=summary) | partially | -| [initrd interface](INITRD_INTERFACE) | Environment, flag files | yes | yes | mkosi, dracut, ArchLinux | yes | ArchLinux | no | -| [Container interface](CONTAINER_INTERFACE) | Environment, Mounts | yes | yes | libvirt/LXC | yes | - | no | -| [Boot Loader interface](BOOT_LOADER_INTERFACE) | EFI variables | yes | yes | gummiboot | yes | - | no | +| [initrd 
interface](/INITRD_INTERFACE) | Environment, flag files | yes | yes | mkosi, dracut, ArchLinux | yes | ArchLinux | no | +| [Container interface](/CONTAINER_INTERFACE) | Environment, Mounts | yes | yes | libvirt/LXC | yes | - | no | +| [Boot Loader interface](/BOOT_LOADER_INTERFACE) | EFI variables | yes | yes | gummiboot | yes | - | no | | [Service bus API](https://www.freedesktop.org/software/systemd/man/org.freedesktop.systemd1.html) | D-Bus | yes | yes | system-config-services | no | - | no | | [logind](https://www.freedesktop.org/software/systemd/man/org.freedesktop.login1.html) | D-Bus | yes | yes | GNOME | no | - | no | | [sd-bus.h API](https://www.freedesktop.org/software/systemd/man/sd-bus.html) | C Library | yes | yes | - | maybe | - | maybe | @@ -102,15 +138,15 @@ And now, here's the list of (hopefully) all APIs that we have introduced with sy | [$XDG_RUNTIME_DIR](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html) | Environment | yes | yes | glib, GNOME | yes | - | no | | [$LISTEN_FDS $LISTEN_PID FD Passing](https://www.freedesktop.org/software/systemd/man/sd_listen_fds.html) | Environment | yes | yes | numerous (via sd-daemon.h) | yes | - | no | | [$NOTIFY_SOCKET Daemon Notifications](https://www.freedesktop.org/software/systemd/man/sd_notify.html) | Environment | yes | yes | a few, including udev | yes | - | no | -| [argv[0][0]='@' Logic](ROOT_STORAGE_DAEMONS) | `/proc` marking | yes | yes | mdadm | yes | - | no | +| [argv[0][0]='@' Logic](/ROOT_STORAGE_DAEMONS) | `/proc` marking | yes | yes | mdadm | yes | - | no | | [Unit file format](https://www.freedesktop.org/software/systemd/man/systemd.unit.html) | File format | yes | yes | numerous | no | - | no | | [Network](https://www.freedesktop.org/software/systemd/man/systemd.network.html) & [Netdev file format](https://www.freedesktop.org/software/systemd/man/systemd.netdev.html) | File format | yes | yes | no | no | - | no | | [Link file 
format](https://www.freedesktop.org/software/systemd/man/systemd.link.html) | File format | yes | yes | no | no | - | no | -| [Journal File Format](JOURNAL_FILE_FORMAT) | File format | yes | yes | - | maybe | - | no | -| [Journal Export Format](JOURNAL_EXPORT_FORMATS.md#journal-export-format) | File format | yes | yes | - | yes | - | no | -| [Journal JSON Format](JOURNAL_EXPORT_FORMATS.md#journal-json-format) | File format | yes | yes | - | yes | - | no | -| [Cooperation in cgroup tree](https://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups) | Treaty | yes | yes | libvirt | yes | libvirt | no | -| [Password Agents](PASSWORD_AGENTS) | Socket+Files | yes | yes | - | yes | - | no | +| [Journal File Format](/JOURNAL_FILE_FORMAT) | File format | yes | yes | - | maybe | - | no | +| [Journal Export Format](JOURNAL_EXPORT_FORMATS#journal-export-format) | File format | yes | yes | - | yes | - | no | +| [Journal JSON Format](JOURNAL_EXPORT_FORMATS#journal-json-format) | File format | yes | yes | - | yes | - | no | +| [Cooperation in cgroup tree](/PAX_CONTROL_GROUPS) | Treaty | yes | yes | libvirt | yes | libvirt | no | +| [Password Agents](/PASSWORD_AGENTS) | Socket+Files | yes | yes | - | yes | - | no | | [udev multi-seat properties](https://www.freedesktop.org/software/systemd/man/sd-login.html) | udev Property | yes | yes | X11, gdm | no | - | no | | udev session switch ACL properties | udev Property | no | no | - | no | - | no | | [CLI of systemctl,...](https://www.freedesktop.org/software/systemd/man/systemctl.html) | CLI | yes | yes | numerous | no | - | no | @@ -135,15 +171,21 @@ And now, here's the list of (hopefully) all APIs that we have introduced with sy ### Explanations -Items for which "systemd implementation portable to other OSes" is "partially" means that it is possible to run the respective tools that are included in the systemd tarball outside of systemd. 
Note however that this is not officially supported, so you are more or less on your own if you do this. If you are opting for this solution simply build systemd as you normally would but drop all files except those which you are interested in. +Items for which "systemd implementation portable to other OSes" is "partially" means that it is possible to run the respective tools that are included in the systemd tarball outside of systemd. +Note however that this is not officially supported, so you are more or less on your own if you do this. +If you are opting for this solution simply build systemd as you normally would but drop all files except those which you are interested in. -Of course, it is our intention to eventually document all interfaces we defined. If we haven't documented them for now, this is usually because we want the flexibility to still change things, or don't want 3rd party applications to make use of these interfaces already. That said, our sources are quite readable and open source, so feel free to spelunk around in the sources if you want to know more. +Of course, it is our intention to eventually document all interfaces we defined. +If we haven't documented them for now, this is usually because we want the flexibility to still change things, or don't want 3rd party applications to make use of these interfaces already. +That said, our sources are quite readable and open source, so feel free to spelunk around in the sources if you want to know more. If you decide to reimplement one of the APIs for which "Reimplementable independently" is "no", then we won't stop you, but you are on your own. -This is not an attempt to comprehensively list all users of these APIs. We are just listing the most obvious/prominent ones which come to our mind. +This is not an attempt to comprehensively list all users of these APIs. +We are just listing the most obvious/prominent ones which come to our mind. 
-Of course, one last thing I can't make myself not ask you before we finish here, and before you start reimplementing these APIs in your distribution: are you sure it's time well spent if you work on reimplementing all this code instead of just spending it on adopting systemd on your distro as well? +Of course, one last thing I can't make myself not ask you before we finish here, and before you start reimplementing these APIs in your distribution: +are you sure it's time well spent if you work on reimplementing all this code instead of just spending it on adopting systemd on your distro as well? ## Independent Operation of systemd Programs diff --git a/docs/PREDICTABLE_INTERFACE_NAMES.md b/docs/PREDICTABLE_INTERFACE_NAMES.md index 9d79f8f..9fa9fea 100644 --- a/docs/PREDICTABLE_INTERFACE_NAMES.md +++ b/docs/PREDICTABLE_INTERFACE_NAMES.md @@ -7,24 +7,37 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Predictable Network Interface Names -Starting with v197 systemd/udev will automatically assign predictable, stable network interface names for all local Ethernet, WLAN and WWAN interfaces. This is a departure from the traditional interface naming scheme (`eth0`, `eth1`, `wlan0`, ...), but should fix real problems. +Starting with v197 systemd/udev will automatically assign predictable, stable network interface names for all local Ethernet, WLAN and WWAN interfaces. +This is a departure from the traditional interface naming scheme (`eth0`, `eth1`, `wlan0`, ...), but should fix real problems. ## Why? -The classic naming scheme for network interfaces applied by the kernel is to simply assign names beginning with `eth0`, `eth1`, ... to all interfaces as they are probed by the drivers. 
As the driver probing is generally not predictable for modern technology this means that as soon as multiple network interfaces are available the assignment of the names `eth0`, `eth1` and so on is generally not fixed anymore and it might very well happen that `eth0` on one boot ends up being `eth1` on the next. This can have serious security implications, for example in firewall rules which are coded for certain naming schemes, and which are hence very sensitive to unpredictable changing names. +The classic naming scheme for network interfaces applied by the kernel is to simply assign names beginning with `eth0`, `eth1`, ... to all interfaces as they are probed by the drivers. +As the driver probing is generally not predictable for modern technology this means that as soon as multiple network interfaces are available the assignment of the names `eth0`, `eth1` and so on is generally not fixed anymore and it might very well happen that `eth0` on one boot ends up being `eth1` on the next. +This can have serious security implications, for example in firewall rules which are coded for certain naming schemes, and which are hence very sensitive to unpredictable changing names. -To fix this problem multiple solutions have been proposed and implemented. For a longer time udev shipped support for assigning permanent `ethX` names to certain interfaces based on their MAC addresses. This turned out to have a multitude of problems, among them: this required a writable root directory which is generally not available; the statelessness of the system is lost as booting an OS image on a system will result in changed configuration of the image; on many systems MAC addresses are not actually fixed, such as on a lot of embedded hardware and particularly on all kinds of virtualization solutions. 
The biggest of all however is that the userspace components trying to assign the interface name raced against the kernel assigning new names from the same `ethX` namespace, a race condition with all kinds of weird effects, among them that assignment of names sometimes failed. As a result support for this has been removed from systemd/udev a while back. +To fix this problem multiple solutions have been proposed and implemented. +For a longer time udev shipped support for assigning permanent `ethX` names to certain interfaces based on their MAC addresses. +This turned out to have a multitude of problems, among them: this required a writable root directory which is generally not available; the statelessness of the system is lost as booting an OS image on a system will result in changed configuration of the image; on many systems MAC addresses are not actually fixed, such as on a lot of embedded hardware and particularly on all kinds of virtualization solutions. +The biggest of all however is that the userspace components trying to assign the interface name raced against the kernel assigning new names from the same `ethX` namespace, a race condition with all kinds of weird effects, among them that assignment of names sometimes failed. +As a result support for this has been removed from systemd/udev a while back. -Another solution that has been implemented is `biosdevname` which tries to find fixed slot topology information in certain firmware interfaces and uses them to assign fixed names to interfaces which incorporate their physical location on the mainboard. In a way this naming scheme is similar to what is already done natively in udev for various device nodes via `/dev/*/by-path/` symlinks. In many cases, biosdevname departs from the low-level kernel device identification schemes that udev generally uses for these symlinks, and instead invents its own enumeration schemes. 
+Another solution that has been implemented is `biosdevname` which tries to find fixed slot topology information in certain firmware interfaces and uses them to assign fixed names to interfaces which incorporate their physical location on the mainboard. +In a way this naming scheme is similar to what is already done natively in udev for various device nodes via `/dev/*/by-path/` symlinks. +In many cases, biosdevname departs from the low-level kernel device identification schemes that udev generally uses for these symlinks, and instead invents its own enumeration schemes. -Finally, many distributions support renaming interfaces to user-chosen names (think: `internet0`, `dmz0`, ...) keyed off their MAC addresses or physical locations as part of their networking scripts. This is a very good choice but does have the problem that it implies that the user is willing and capable of choosing and assigning these names. +Finally, many distributions support renaming interfaces to user-chosen names (think: `internet0`, `dmz0`, ...) keyed off their MAC addresses or physical locations as part of their networking scripts. +This is a very good choice but does have the problem that it implies that the user is willing and capable of choosing and assigning these names. -We believe it is a good default choice to generalize the scheme pioneered by `biosdevname`. Assigning fixed names based on firmware/topology/location information has the big advantage that the names are fully automatic, fully predictable, that they stay fixed even if hardware is added or removed (i.e. no reenumeration takes place) and that broken hardware can be replaced seamlessly. That said, they admittedly are sometimes harder to read than the `eth0` or `wlan0` everybody is used to. Example: `enp5s0` +We believe it is a good default choice to generalize the scheme pioneered by `biosdevname`. 
+Assigning fixed names based on firmware/topology/location information has the big advantage that the names are fully automatic, fully predictable, that they stay fixed even if hardware is added or removed (i.e. no reenumeration takes place) and that broken hardware can be replaced seamlessly. +That said, they admittedly are sometimes harder to read than the `eth0` or `wlan0` everybody is used to. Example: `enp5s0` ## What precisely has changed in v197? -With systemd 197 we have added native support for a number of different naming policies into systemd/udevd proper and made a scheme similar to biosdevname's (but generally more powerful, and closer to kernel-internal device identification schemes) the default. The following different naming schemes for network interfaces are now supported by udev natively: +With systemd 197 we have added native support for a number of different naming policies into systemd/udevd proper and made a scheme similar to biosdevname's (but generally more powerful, and closer to kernel-internal device identification schemes) the default. +The following different naming schemes for network interfaces are now supported by udev natively: 1. Names incorporating Firmware/BIOS provided index numbers for on-board devices (example: `eno1`) 1. Names incorporating Firmware/BIOS provided PCI Express hotplug slot index numbers (example: `ens1`) @@ -32,9 +45,13 @@ With systemd 197 we have added native support for a number of different naming p 1. Names incorporating the interfaces's MAC address (example: `enx78e7d1ea46da`) 1. Classic, unpredictable kernel-native ethX naming (example: `eth0`) -By default, systemd v197 will now name interfaces following policy 1) if that information from the firmware is applicable and available, falling back to 2) if that information from the firmware is applicable and available, falling back to 3) if applicable, falling back to 5) in all other cases. 
Policy 4) is not used by default, but is available if the user chooses so. +By default, systemd v197 will now name interfaces following policy 1) if that information from the firmware is applicable and available, falling back to 2) if that information from the firmware is applicable and available, falling back to 3) if applicable, falling back to 5) in all other cases. +Policy 4) is not used by default, but is available if the user chooses so. -This combined policy is only applied as last resort. That means, if the system has biosdevname installed, it will take precedence. If the user has added udev rules which change the name of the kernel devices these will take precedence too. Also, any distribution specific naming schemes generally take precedence. +This combined policy is only applied as last resort. +That means, if the system has biosdevname installed, it will take precedence. +If the user has added udev rules which change the name of the kernel devices these will take precedence too. +Also, any distribution specific naming schemes generally take precedence. ## Come again, what good does this do? @@ -54,7 +71,9 @@ With this new scheme you now get: * The same on all distributions that adopted systemd/udev * It's easy to opt out of the scheme (see below) -Does this have any drawbacks? Yes, it does. Previously it was practically guaranteed that hosts equipped with a single ethernet card only had a single `eth0` interface. With this new scheme in place, an administrator now has to check first what the local interface name is before they can invoke commands on it, where previously they had a good chance that `eth0` was the right name. +Does this have any drawbacks? Yes, it does. +Previously it was practically guaranteed that hosts equipped with a single ethernet card only had a single `eth0` interface. 
+With this new scheme in place, an administrator now has to check first what the local interface name is before they can invoke commands on it, where previously they had a good chance that `eth0` was the right name. ## I don't like this, how do I disable this? @@ -67,4 +86,5 @@ You basically have three options: ## How does the new naming scheme look like, precisely? -That's documented in detail the [systemd.net-naming-scheme(7)](https://www.freedesktop.org/software/systemd/man/systemd.net-naming-scheme.html) man page. Please refer to this in case you are wondering how to decode the new interface names. +That's documented in detail in the [systemd.net-naming-scheme(7)](https://www.freedesktop.org/software/systemd/man/systemd.net-naming-scheme.html) man page. +Please refer to this in case you are wondering how to decode the new interface names. diff --git a/docs/PRESET.md b/docs/PRESET.md index a2ae323..62aab96 100644 --- a/docs/PRESET.md +++ b/docs/PRESET.md @@ -9,21 +9,32 @@ SPDX-License-Identifier: LGPL-2.1-or-later ## Why? -Different **distributions** have different policies on which services shall be enabled by default when the package they are shipped in is installed. On Fedora all services stay off by default, so that installing a package will not cause a service to be enabled (with some exceptions). On Debian all services are immediately enabled by default, so that installing a package will cause its service(s) to be enabled right-away. +Different **distributions** have different policies on which services shall be enabled by default when the package they are shipped in is installed. +On Fedora all services stay off by default, so that installing a package will not cause a service to be enabled (with some exceptions). +On Debian all services are immediately enabled by default, so that installing a package will cause its service(s) to be enabled right-away.
-Different **spins** (flavours, remixes, whatever you might want to call them) of a distribution also have different policies on what services to enable, and what services to leave off. For example, the Fedora default will enable gdm as display manager by default, while the Fedora KDE spin will enable kdm instead. +Different **spins** (flavours, remixes, whatever you might want to call them) of a distribution also have different policies on what services to enable, and what services to leave off. +For example, the Fedora default will enable gdm as display manager by default, while the Fedora KDE spin will enable kdm instead. -Different **sites** might also have different policies what to turn on by default and what to turn off. For example, one administrator would prefer to enforce the policy of "ssh should be always on, but everything else off", while another one might say "snmp always on, and for everything else use the distribution policy defaults". +Different **sites** might also have different policies what to turn on by default and what to turn off. +For example, one administrator would prefer to enforce the policy of "ssh should be always on, but everything else off", while another one might say "snmp always on, and for everything else use the distribution policy defaults". ## The Logic -Traditionally, policy about what services shall be enabled and what services shall not have been decided globally by the distributions, and were enforced in each package individually. This made it cumbersome to implement different policies per spin or per site, or to create software packages that do the right thing on more than one distribution. The enablement _mechanism_ was also encoding the enablement _policy_. +Traditionally, policy about what services shall be enabled and what services shall not have been decided globally by the distributions, and were enforced in each package individually. 
+This made it cumbersome to implement different policies per spin or per site, or to create software packages that do the right thing on more than one distribution. +The enablement _mechanism_ was also encoding the enablement _policy_. -systemd 32 and newer support package "preset" policies. These encode which units shall be enabled by default when they are installed, and which units shall not be enabled. +systemd 32 and newer support package "preset" policies. +These encode which units shall be enabled by default when they are installed, and which units shall not be enabled. -Preset files may be written for specific distributions, for specific spins or for specific sites, in order to enforce different policies as needed. Preset policies are stored in .preset files in /usr/lib/systemd/system-preset/. If no policy exists the default implied policy of "enable everything" is enforced, i.e. in Debian style. +Preset files may be written for specific distributions, for specific spins or for specific sites, in order to enforce different policies as needed. +Preset policies are stored in .preset files in /usr/lib/systemd/system-preset/. +If no policy exists the default implied policy of "enable everything" is enforced, i.e. in Debian style. -The policy encoded in preset files is applied to a unit by invoking "systemctl preset ". It is recommended to use this command in all package post installation scriptlets. "systemctl preset " is identical to "systemctl enable " resp. "systemctl disable " depending on the policy. +The policy encoded in preset files is applied to a unit by invoking "systemctl preset ". +It is recommended to use this command in all package post installation scriptlets. +"systemctl preset " is identical to "systemctl enable " resp. "systemctl disable " depending on the policy. Preset files allow clean separation of enablement mechanism (inside the package scriptlets, by invoking "systemctl preset"), and enablement policy (centralized in the preset files). 
@@ -39,6 +50,10 @@ Documentation for the recommended package scriptlets you find here: [http://www. For the preset logic to be useful, distributions need to implement a couple of steps: -- The default distribution policy needs to be encoded in a preset file /usr/lib/systemd/system-preset/99-default.preset or suchlike, unless the implied policy of "enable everything" is the right choice. For a Fedora-like policy of "enable nothing" it is sufficient to include the single line "disable" into that file. The default preset file should be installed as part of one the core packages of the distribution. +- The default distribution policy needs to be encoded in a preset file /usr/lib/systemd/system-preset/99-default.preset or suchlike, unless the implied policy of "enable everything" is the right choice. +For a Fedora-like policy of "enable nothing" it is sufficient to include the single line "disable" into that file. +The default preset file should be installed as part of one of the core packages of the distribution. + - All packages need to be updated to use "systemctl preset" in the post install scriptlets. + - (Optionally) spins/remixes/flavours should define their own preset file, either overriding or extending the default distribution preset policy. Also see the fedora feature page: [https://fedoraproject.org/wiki/Features/PackagePresets](https://fedoraproject.org/wiki/Features/PackagePresets) diff --git a/docs/RANDOM_SEEDS.md b/docs/RANDOM_SEEDS.md index b2712ca..49d3329 100644 --- a/docs/RANDOM_SEEDS.md +++ b/docs/RANDOM_SEEDS.md @@ -12,81 +12,73 @@ random numbers from early boot on. ## Linux Kernel Entropy Pool -Today's computer systems require random number generators for numerous -cryptographic and other purposes. On Linux systems, the kernel's entropy pool -is typically used as high-quality source of random numbers.
The kernel's -entropy pool combines various entropy inputs together, mixes them and provides -an API to userspace as well as to internal kernel subsystems to retrieve -it. This entropy pool needs to be initialized with a minimal level of entropy -before it can provide high quality, cryptographic random numbers to -applications. Until the entropy pool is fully initialized application requests -for high-quality random numbers cannot be fulfilled. +Today's computer systems require random number generators for numerous cryptographic and other purposes. +On Linux systems, the kernel's entropy pool is typically used as high-quality source of random numbers. The kernel's entropy pool combines various entropy inputs together, mixes them and provides +an API to userspace as well as to internal kernel subsystems to retrieve it. + +This entropy pool needs to be initialized with a minimal level of entropy +before it can provide high quality, cryptographic random numbers to applications. +Until the entropy pool is fully initialized application requests for high-quality random numbers cannot be fulfilled. The Linux kernel provides three relevant userspace APIs to request random data from the kernel's entropy pool: * The [`getrandom()`](https://man7.org/linux/man-pages/man2/getrandom.2.html) - system call with its `flags` parameter set to 0. If invoked, the calling - program will synchronously block until the random pool is fully initialized + system call with its `flags` parameter set to 0. + If invoked, the calling program will synchronously block until the random pool is fully initialized and the requested bytes can be provided. -* The `getrandom()` system call with its `flags` parameter set to - `GRND_NONBLOCK`. If invoked, the request for random bytes will fail if the - pool is not initialized yet. +* The `getrandom()` system call with its `flags` parameter set to `GRND_NONBLOCK`. + If invoked, the request for random bytes will fail if the pool is not initialized yet. 
* Reading from the [`/dev/urandom`](https://man7.org/linux/man-pages/man4/urandom.4.html) - pseudo-device will always return random bytes immediately, even if the pool - is not initialized. The provided random bytes will be of low quality in this - case however. Moreover, the kernel will log about all programs using this - interface in this state, and which thus potentially rely on an uninitialized - entropy pool. + pseudo-device will always return random bytes immediately, even if the pool is not initialized. + The provided random bytes will be of low quality in this case however. + Moreover, the kernel will log about all programs using this interface in this state, and which thus potentially rely on an uninitialized entropy pool. (Strictly speaking, there are more APIs, for example `/dev/random`, but these should not be used by almost any application and hence aren't mentioned here.) -Note that the time it takes to initialize the random pool may differ between -systems. If local hardware random number generators are available, -initialization is likely quick, but particularly in embedded and virtualized -environments available entropy is small and thus random pool initialization +Note that the time it takes to initialize the random pool may differ between systems. +If local hardware random number generators are available, initialization is likely quick, but particularly in embedded and virtualized environments available entropy is small and thus random pool initialization might take a long time (up to tens of minutes!). -Modern hardware tends to come with a number of hardware random number -generators (hwrng), that may be used to relatively quickly fill up the entropy -pool. Specifically: +Modern hardware tends to come with a number of hardware random number generators (hwrng), that may be used to relatively quickly fill up the entropy pool. 
+Specifically: * All recent Intel and AMD CPUs provide the CPU opcode - [RDRAND](https://en.wikipedia.org/wiki/RdRand) to acquire random bytes. Linux - includes random bytes generated this way in its entropy pool, but didn't use + [RDRAND](https://en.wikipedia.org/wiki/RdRand) to acquire random bytes. + Linux includes random bytes generated this way in its entropy pool, but didn't use to credit entropy for it (i.e. data from this source wasn't considered good - enough to consider the entropy pool properly filled even though it was - used). This has changed recently however, and most big distributions have - turned on the `CONFIG_RANDOM_TRUST_CPU=y` kernel compile time option. This - means systems with CPUs supporting this opcode will be able to very quickly + enough to consider the entropy pool properly filled even though it was used). + This has changed recently however, and most big distributions have + turned on the `CONFIG_RANDOM_TRUST_CPU=y` kernel compile time option. + This means systems with CPUs supporting this opcode will be able to very quickly reach the "pool filled" state. -* The TPM security chip that is available on all modern desktop systems has a - hwrng. It is also fed into the entropy pool, but generally not credited - entropy. You may use `rng_core.default_quality=1000` on the kernel command - line to change that, but note that this is a global setting affect all - hwrngs. (Yeah, that's weird.) +* The TPM security chip that is available on all modern desktop systems has a hwrng. + It is also fed into the entropy pool, but generally not credited entropy. + You may use `rng_core.default_quality=1000` on the kernel command line to change that, + but note that this is a global setting affecting all hwrngs. + (Yeah, that's weird.) -* Many Intel and AMD chipsets have hwrng chips. Their Linux drivers usually - don't credit entropy. (But there's `rng_core.default_quality=1000`, see - above.) +* Many Intel and AMD chipsets have hwrng chips.
+ Their Linux drivers usually don't credit entropy. + (But there's `rng_core.default_quality=1000`, see above.) -* Various embedded boards have hwrng chips. Some drivers automatically credit - entropy, others do not. Some WiFi chips appear to have hwrng sources too, and +* Various embedded boards have hwrng chips. + Some drivers automatically credit entropy, others do not. + Some WiFi chips appear to have hwrng sources too, and they usually do not credit entropy for them. * `virtio-rng` is used in virtualized environments and retrieves random data from the VM host. It credits full entropy. -* The EFI firmware typically provides a RNG API. When transitioning from UEFI - to kernel mode Linux will query some random data through it, and feed it into - the pool, but not credit entropy to it. What kind of random source is behind - the EFI RNG API is often not entirely clear, but it hopefully is some kind of - hardware source. +* The EFI firmware typically provides a RNG API. + When transitioning from UEFI to kernel mode Linux will query some random data through it, and feed it into + the pool, but not credit entropy to it. + What kind of random source is behind the EFI RNG API is often not entirely clear, but it hopefully is some kind of hardware source. If neither of these are available (in fact, even if they are), Linux generates entropy from various non-hwrng sources in various subsystems, all of which @@ -95,31 +87,30 @@ particular in virtualized environments. ## `systemd`'s Use of Random Numbers -systemd is responsible for bringing up the OS. It generally runs as the first -userspace process the kernel invokes. Because of that it runs at a time where -the entropy pool is typically not yet initialized, and thus requests to acquire -random bytes will either be delayed, will fail or result in a noisy kernel log +systemd is responsible for bringing up the OS. +It generally runs as the first userspace process the kernel invokes. 
+Because of that it runs at a time where the entropy pool is typically not yet initialized, +and thus requests to acquire random bytes will either be delayed, will fail or result in a noisy kernel log message (see above). -Various other components run during early boot that require random bytes. For -example, initrds nowadays communicate with encrypted networks or access -encrypted storage which might need random numbers. systemd itself requires -random numbers as well, including for the following uses: +Various other components run during early boot that require random bytes. +For example, initrds nowadays communicate with encrypted networks or access +encrypted storage which might need random numbers. +systemd itself requires random numbers as well, including for the following uses: * systemd assigns 'invocation' UUIDs to all services it invokes that uniquely - identify each invocation. This is useful to retain a global handle on a specific - service invocation and relate it to other data. For example, log data - collected by the journal usually includes the invocation UUID and thus the - runtime context the service manager maintains can be neatly matched up with - the log data a specific service invocation generated. systemd also - initializes `/etc/machine-id` with a randomized UUID. (systemd also makes use - of the randomized "boot id" the kernel exposes in - `/proc/sys/kernel/random/boot_id`). These UUIDs are exclusively Type 4 UUIDs, - i.e. randomly generated ones. - -* systemd maintains various hash tables internally. In order to harden them - against [collision - attacks](https://www.cs.auckland.ac.nz/~mcw/Teaching/refs/misc/denial-of-service.pdf) + identify each invocation. + This is useful to retain a global handle on a specific service invocation and relate it to other data. 
+ For example, log data collected by the journal usually includes the invocation UUID + and thus the runtime context the service manager maintains can be neatly matched up with + the log data a specific service invocation generated. + systemd also initializes `/etc/machine-id` with a randomized UUID. + (systemd also makes use of the randomized "boot id" the kernel exposes in `/proc/sys/kernel/random/boot_id`). + These UUIDs are exclusively Type 4 UUIDs, i.e. randomly generated ones. + +* systemd maintains various hash tables internally. + In order to harden them against + [collision attacks](https://www.cs.auckland.ac.nz/~mcw/Teaching/refs/misc/denial-of-service.pdf) they are seeded with random numbers. * At various places systemd needs random bytes for temporary file name @@ -133,20 +124,18 @@ random numbers as well, including for the following uses: Note that these cases generally do not require a cryptographic-grade random number generator, as most of these utilize random numbers to minimize risk of -collision and not to generate secret key material. However, they usually do -require "medium-grade" random data. For example: systemd's hash-maps are -reseeded if they grow beyond certain thresholds (and thus collisions are more -likely). This means they are generally fine with low-quality (even constant) -random numbers initially as long as they get better with time, so that -collision attacks are eventually thwarted as better, non-guessable seeds are +collision and not to generate secret key material. +However, they usually do require "medium-grade" random data. +For example: systemd's hash-maps are reseeded if they grow beyond certain thresholds (and thus collisions are more likely). +This means they are generally fine with low-quality (even constant) random numbers initially as long as they get better with time, so that collision attacks are eventually thwarted as better, non-guessable seeds are acquired.
## Keeping `systemd'`s Demand on the Kernel Entropy Pool Minimal Since most of systemd's own use of random numbers do not require cryptographic-grade RNGs, it tries to avoid blocking reads to the kernel's RNG, -opting instead for using `getrandom(GRND_INSECURE)`. After the pool is -initialized, this is identical to `getrandom(0)`, returning cryptographically +opting instead for using `getrandom(GRND_INSECURE)`. +After the pool is initialized, this is identical to `getrandom(0)`, returning cryptographically secure random numbers, but before it's initialized it has the nice effect of not blocking system boot. @@ -158,51 +147,47 @@ boot, in order to ensure the entropy pool is filled up quickly. 1. When systemd's PID 1 detects it runs in a virtualized environment providing the `virtio-rng` interface it will load the necessary kernel modules to make use of it during earliest boot, if possible — much earlier than regular - kernel module loading done by `systemd-udevd.service`. This should ensure - that in VM environments the entropy pool is quickly filled, even before + kernel module loading done by `systemd-udevd.service`. + This should ensure that in VM environments the entropy pool is quickly filled, even before systemd invokes the first service process — as long as the VM environment provides virtualized RNG hardware (and VM environments really should!). 2. The [`systemd-random-seed.service`](https://www.freedesktop.org/software/systemd/man/systemd-random-seed.service.html) system service will load a random seed from `/var/lib/systemd/random-seed` - into the kernel entropy pool. By default it does not credit entropy for it - though, since the seed is — more often than not — not reset when 'golden' - master images of an OS are created, and thus replicated into every - installation. 
If OS image builders carefully reset the random seed file - before generating the image it should be safe to credit entropy, which can - be enabled by setting the `$SYSTEMD_RANDOM_SEED_CREDIT` environment variable - for the service to `1` (or even `force`, see man page). Note however, that - this service typically runs relatively late during early boot: long after - the initrd completed, and after the `/var/` file system became - writable. This is usually too late for many applications, it is hence not - advised to rely exclusively on this functionality to seed the kernel's - entropy pool. Also note that this service synchronously waits until the - kernel's entropy pool is initialized before completing start-up. It may thus - be used by other services as synchronization point to order against, if they + into the kernel entropy pool. + By default it does not credit entropy for it though, since the seed is — more often than not — not reset when 'golden' master images of an OS are created, and thus replicated into every installation. + If OS image builders carefully reset the random seed file before generating the image it should be safe to credit entropy, which can be enabled by setting the `$SYSTEMD_RANDOM_SEED_CREDIT` environment variable + for the service to `1` (or even `force`, see man page). + Note however, that this service typically runs relatively late during early boot: long after + the initrd completed, and after the `/var/` file system became writable. + This is usually too late for many applications, it is hence not advised to rely exclusively on this functionality to seed the kernel's entropy pool. + Also note that this service synchronously waits until the kernel's entropy pool is initialized before completing start-up. + It may thus be used by other services as synchronization point to order against, if they require an initialized entropy pool to operate correctly. 3. 
The [`systemd-boot`](https://www.freedesktop.org/software/systemd/man/systemd-boot.html) EFI boot loader included in systemd is able to maintain and provide a random seed stored in the EFI System Partition (ESP) to the booted OS, which allows - booting up with a fully initialized entropy pool from earliest boot - on. During installation of the boot loader (or when invoking [`bootctl - random-seed`](https://www.freedesktop.org/software/systemd/man/bootctl.html#random-seed)) - a seed file with an initial seed is placed in a file `/loader/random-seed` - in the ESP. In addition, an identically sized randomized EFI variable called - the 'system token' is set, which is written to the machine's firmware NVRAM. + booting up with a fully initialized entropy pool from earliest boot on. + During installation of the boot loader (or when invoking + [`bootctl random-seed`](https://www.freedesktop.org/software/systemd/man/bootctl.html#random-seed)) + a seed file with an initial seed is placed in a file `/loader/random-seed` in the ESP. + In addition, an identically sized randomized EFI variable called the 'system token' is set, which is written to the machine's firmware NVRAM. + During boot, when `systemd-boot` finds both the random seed file and the system token they are combined and hashed with SHA256 (in counter mode, to generate sufficient data), to generate a new random seed file to store in - the ESP as well as a random seed to pass to the OS kernel. The new random - seed file for the ESP is then written to the ESP, ensuring this is completed + the ESP as well as a random seed to pass to the OS kernel. + The new random seed file for the ESP is then written to the ESP, ensuring this is completed before the OS is invoked. The kernel then reads the random seed that the boot loader passes to it, via the EFI configuration table entry, `LINUX_EFI_RANDOM_SEED_TABLE_GUID` (1ce1e5bc-7ceb-42f2-81e5-8aadf180f57b), which is allocated with pool memory - of type `EfiACPIReclaimMemory`.
Its contents have the form: + of type `EfiACPIReclaimMemory`. + Its contents have the form: ``` struct linux_efi_random_seed { u32 size; // of the 'seed' array in bytes @@ -216,17 +201,16 @@ boot, in order to ensure the entropy pool is filled up quickly. This mechanism is able to safely provide an initialized entropy pool before userspace even starts and guarantees that different seeds are passed from the boot loader to the OS on every boot (in a way that does not allow - regeneration of an old seed file from a new seed file). Moreover, when an OS - image is replicated between multiple images and the random seed is not - reset, this will still result in different random seeds being passed to the - OS, as the per-machine 'system token' is specific to the physical host, and - not included in OS disk images. If the 'system token' is properly - initialized and kept sufficiently secret it should not be possible to + regeneration of an old seed file from a new seed file). + + Moreover, when an OS image is replicated between multiple images and the random seed is not reset, this will still result in different random seeds being passed to the OS, as the per-machine 'system token' is specific to the physical host, and not included in OS disk images. + + If the 'system token' is properly initialized and kept sufficiently secret it should not be possible to regenerate the entropy pool of different machines, even if this seed is the only source of entropy. - Note that the writes to the ESP needed to maintain the random seed should be - minimal. Because the size of the random seed file is generally set to 32 bytes, + Note that the writes to the ESP needed to maintain the random seed should be minimal. 
+ Because the size of the random seed file is generally set to 32 bytes, updating the random seed in the ESP should be doable safely with a single sector write (since hard-disk sectors typically happen to be 512 bytes long, too), which should be safe even with FAT file system drivers built into @@ -234,10 +218,10 @@ boot, in order to ensure the entropy pool is filled up quickly. 4. A kernel command line option `systemd.random_seed=` may be used to pass in a base64 encoded seed to initialize the kernel's entropy pool from during - early service manager initialization. This option is only safe in testing - environments, as the random seed passed this way is accessible to - unprivileged programs via `/proc/cmdline`. Using this option outside of - testing environments is a security problem since cryptographic key material + early service manager initialization. + This option is only safe in testing environments, as the random seed passed this way is accessible to + unprivileged programs via `/proc/cmdline`. + Using this option outside of testing environments is a security problem since cryptographic key material derived from the entropy pool initialized with a seed accessible to unprivileged programs should not be considered secret. @@ -250,9 +234,9 @@ early-boot entropy in most cases. Specifically: 2. On virtualized systems, the early `virtio-rng` hookup should ensure entropy is available early on — as long as the VM environment provides virtualized - RNG devices, which they really should all do in 2019. Complain to your - hosting provider if they don't. For VMs used in testing environments, - `systemd.random_seed=` may be used as an alternative to a virtualized RNG. + RNG devices, which they really should all do in 2019. + Complain to your hosting provider if they don't. + For VMs used in testing environments, `systemd.random_seed=` may be used as an alternative to a virtualized RNG. 3. 
In general, systemd's own reliance on the kernel entropy pool is minimal (due to the use of `GRND_INSECURE`). @@ -263,20 +247,20 @@ early-boot entropy in most cases. Specifically: This primarily leaves two kind of systems in the cold: 1. Some embedded systems. Many embedded chipsets have hwrng functionality these - days. Consider using them while crediting - entropy. (i.e. `rng_core.default_quality=1000` on the kernel command line is - your friend). Or accept that the system might take a bit longer to - boot. Alternatively, consider implementing a solution similar to - systemd-boot's random seed concept in your platform's boot loader. + days. + Consider using them while crediting entropy. + (i.e. `rng_core.default_quality=1000` on the kernel command line is your friend). + Or accept that the system might take a bit longer to boot. + Alternatively, consider implementing a solution similar to systemd-boot's random seed concept in your platform's boot loader. 2. Virtualized environments that lack both virtio-rng and RDRAND, outside of - test environments. Tough luck. Talk to your hosting provider, and ask them - to fix this. + test environments. + Tough luck. Talk to your hosting provider, and ask them to fix this. 3. Also note: if you deploy an image without any random seed and/or without installing any 'system token' in an EFI variable, as described above, this - means that on the first boot no seed can be passed to the OS - either. However, as the boot completes (with entropy acquired elsewhere), + means that on the first boot no seed can be passed to the OS either. + However, as the boot completes (with entropy acquired elsewhere), systemd will automatically install both a random seed in the GPT and a 'system token' in the EFI variable space, so that any future boots will have entropy from earliest boot on — all provided `systemd-boot` is used. @@ -286,12 +270,13 @@ This primarily leaves two kind of systems in the cold: 1. *Why don't you just use getrandom()? 
That's all you need!* Did you read any of the above? getrandom() is hooked to the kernel entropy - pool, and during early boot it's not going to be filled yet, very likely. We - do use it in many cases, but not in all. Please read the above again! + pool, and during early boot it's not going to be filled yet, very likely. + We do use it in many cases, but not in all. + Please read the above again! 2. *Why don't you use - [getentropy()](https://man7.org/linux/man-pages/man3/getentropy.3.html)? That's - all you need!* + [getentropy()](https://man7.org/linux/man-pages/man3/getentropy.3.html)? + That's all you need!* Same story. That call is just a different name for `getrandom()` with `flags` set to zero, and some additional limitations, and thus it also needs @@ -299,16 +284,14 @@ This primarily leaves two kind of systems in the cold: are trying to address here. 3. *Why don't you generate your UUIDs with - [`uuidd`](https://man7.org/linux/man-pages/man8/uuidd.8.html)? That's all you - need!* + [`uuidd`](https://man7.org/linux/man-pages/man8/uuidd.8.html)? + That's all you need!* First of all, that's a system service, i.e. something that runs as "payload" of systemd, long after systemd is already up and hence can't provide us - UUIDs during earliest boot yet. Don't forget: to assign the invocation UUID - for the `uuidd.service` start we already need a UUID that the service is - supposed to provide us. More importantly though, `uuidd` needs state/a random - seed/a MAC address/host ID to operate, all of which are not available during - early boot. + UUIDs during earliest boot yet. + Don't forget: to assign the invocation UUID for the `uuidd.service` start we already need a UUID that the service is supposed to provide us. + More importantly though, `uuidd` needs state/a random seed/a MAC address/host ID to operate, all of which are not available during early boot. 4. *Why don't you generate your UUIDs with `/proc/sys/kernel/random/uuid`? 
That's all you need!* @@ -316,19 +299,21 @@ This primarily leaves two kind of systems in the cold: This is just a different, more limited interface to `/dev/urandom`. It gains us nothing. -5. *Why don't you use [`rngd`](https://github.com/nhorman/rng-tools), +5. *Why don't you use + [`rngd`](https://github.com/nhorman/rng-tools), [`haveged`](http://www.issihosts.com/haveged/), - [`egd`](http://egd.sourceforge.net/)? That's all you need!* + [`egd`](http://egd.sourceforge.net/)? + That's all you need!* - Like `uuidd` above these are system services, hence come too late for our - use-case. In addition much of what `rngd` provides appears to be equivalent + Like `uuidd` above these are system services, hence come too late for our use-case. + In addition much of what `rngd` provides appears to be equivalent to `CONFIG_RANDOM_TRUST_CPU=y` or `rng_core.default_quality=1000`, except - being more complex and involving userspace. These services partly measure - system behavior (such as scheduling effects) which the kernel either + being more complex and involving userspace. + These services partly measure system behavior (such as scheduling effects) which the kernel either already feeds into its pool anyway (and thus shouldn't be fed into it a second time, crediting entropy for it a second time) or is at least - something the kernel could much better do on its own. Hence, if what these - daemons do is still desirable today, this would be much better implemented + something the kernel could much better do on its own. + Hence, if what these daemons do is still desirable today, this would be much better implemented in kernel (which would be very welcome of course, but wouldn't really help us here in our specific problem, see above). 
@@ -337,29 +322,27 @@ This primarily leaves two kind of systems in the cold: This doesn't solve the issue, since it requires a nonce to start from, and it gets that from `getrandom()`, and thus we have to wait for random pool - initialization the same way as calling `getrandom()` - directly. `arc4random()` is nothing more than optimization, in fact it + initialization the same way as calling `getrandom()` directly. + + `arc4random()` is nothing more than optimization, in fact it implements similar algorithms that the kernel entropy pool implements anyway, hence besides being able to provide random bytes with higher - throughput there's little it gets us over just using `getrandom()`. Also, - it's not supported by glibc. And as long as that's the case we are not keen - on using it, as we'd have to maintain that on our own, and we don't want to - maintain our own cryptographic primitives if we don't have to. Since - systemd's uses are not performance relevant (besides the pool initialization - delay, which this doesn't solve), there's hence little benefit for us to - call these functions. That said, if glibc learns these APIs one day, we'll - certainly make use of them where appropriate. - -7. *This is boring: NetBSD had [boot loader entropy seed - support](https://netbsd.gw.com/cgi-bin/man-cgi?boot+8) since ages!* + throughput there's little it gets us over just using `getrandom()`. + + Also, it's not supported by glibc. + And as long as that's the case we are not keen on using it, as we'd have to maintain that on our own, and we don't want to maintain our own cryptographic primitives if we don't have to. + Since systemd's uses are not performance relevant (besides the pool initialization + delay, which this doesn't solve), there's hence little benefit for us to call these functions. + That said, if glibc learns these APIs one day, we'll certainly make use of them where appropriate. + +7. 
*This is boring: NetBSD had [boot loader entropy seed support](https://man.netbsd.org/entropy.7) since ages!* Yes, NetBSD has that, and the above is inspired by that (note though: this - article is about a lot more than that). NetBSD's support is not really safe, - since it neither updates the random seed before using it, nor has any - safeguards against replicating the same disk image with its random seed on - multiple machines (which the 'system token' mentioned above is supposed to - address). This means reuse of the same random seed by the boot loader is - much more likely. + article is about a lot more than that). + NetBSD's support is not really safe, since it neither updates the random seed before using it, + nor has any safeguards against replicating the same disk image with its random seed on + multiple machines (which the 'system token' mentioned above is supposed to address). + This means reuse of the same random seed by the boot loader is much more likely. 8. *Why does PID 1 upload the boot loader provided random seed into kernel instead of kernel doing that on its own?* @@ -370,11 +353,10 @@ This primarily leaves two kind of systems in the cold: 9. *What about non-EFI?* The boot loader random seed logic described above uses EFI variables to pass - the seed from the boot loader to the OS. Other systems might have similar - functionality though, and it shouldn't be too hard to implement something - similar for them. Ideally, we'd have an official way to pass such a seed as - part of the `struct boot_params` from the boot loader to the kernel, but - this is currently not available. + the seed from the boot loader to the OS. + Other systems might have similar functionality though, and it shouldn't be too hard to implement something + similar for them. + Ideally, we'd have an official way to pass such a seed as part of the `struct boot_params` from the boot loader to the kernel, but this is currently not available. 10. 
*I use a different boot loader than `systemd-boot`, I'd like to use boot loader random seeds too!* @@ -384,25 +366,23 @@ This primarily leaves two kind of systems in the cold: for an introduction why. That said, any boot loader can re-implement the logic described above, and can pass a random seed that systemd as PID 1 will then upload into the kernel's entropy pool. For details see the - [Boot Loader Interface](BOOT_LOADER_INTERFACE) documentation. + [Boot Loader Interface](/BOOT_LOADER_INTERFACE) documentation. 11. *Why not pass the boot loader random seed via kernel command line instead of as EFI variable?* - The kernel command line is accessible to unprivileged processes via - `/proc/cmdline`. It's not desirable if unprivileged processes can use this - information to possibly gain too much information about the current state - of the kernel's entropy pool. + The kernel command line is accessible to unprivileged processes via `/proc/cmdline`. + It's not desirable if unprivileged processes can use this information to possibly gain too much information about the current state of the kernel's entropy pool. That said, we actually do implement this with the `systemd.random_seed=` - kernel command line option. Don't use this outside of testing environments, - however, for the aforementioned reasons. + kernel command line option. + Don't use this outside of testing environments, however, for the aforementioned reasons. 12. *Why doesn't `systemd-boot` rewrite the 'system token' too each time when updating the random seed file stored in the ESP?* - The system token is stored as persistent EFI variable, i.e. in some form of - NVRAM. These memory chips tend be of low quality in many machines, and - hence we shouldn't write them too often. Writing them once during - installation should generally be OK, but rewriting them on every single + The system token is stored as persistent EFI variable, i.e. in some form of NVRAM. 
+ These memory chips tend to be of low quality in many machines, and + hence we shouldn't write them too often. + Writing them once during installation should generally be OK, but rewriting them on every single boot would probably wear the chip out too much, and we shouldn't risk that. diff --git a/docs/ROOT_STORAGE_DAEMONS.md b/docs/ROOT_STORAGE_DAEMONS.md index 69812c9..41fc602 100644 --- a/docs/ROOT_STORAGE_DAEMONS.md +++ b/docs/ROOT_STORAGE_DAEMONS.md @@ -106,7 +106,7 @@ to find a different solution to your problem._ The recommended way to distinguish between run-from-initrd and run-from-rootfs for a daemon is to check for `/etc/initrd-release` (which exists on all modern -initrd implementations, see the [initrd Interface](INITRD_INTERFACE) for +initrd implementations, see the [initrd Interface](/INITRD_INTERFACE) for details) which when exists results in `argv[0][0]` being set to `@`, and otherwise doesn't. Something like this: @@ -191,4 +191,4 @@ few additional notes for supporting these setups: program consult this blog story: [Socket Activation](https://0pointer.de/blog/projects/socket-activation.html) -* Consider having a look at the [initrd Interface of systemd](INITRD_INTERFACE). +* Consider having a look at the [initrd Interface of systemd](/INITRD_INTERFACE). diff --git a/docs/SECURITY.md b/docs/SECURITY.md index a44b90d..f9f2e91 100644 --- a/docs/SECURITY.md +++ b/docs/SECURITY.md @@ -7,8 +7,12 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Reporting of Security Vulnerabilities -If you discover a security vulnerability, we'd appreciate a non-public disclosure. systemd developers can be contacted privately on the **[systemd-security@redhat.com](mailto:systemd-security@redhat.com) mailing list**. The disclosure will be coordinated with distributions. +If you discover a security vulnerability, we'd appreciate a non-public disclosure.
+systemd developers can be contacted privately on the **[systemd-security@redhat.com](mailto:systemd-security@redhat.com) mailing list**. +The disclosure will be coordinated with distributions. (The [issue tracker](https://github.com/systemd/systemd/issues) and [systemd-devel mailing list](https://lists.freedesktop.org/mailman/listinfo/systemd-devel) are fully public.) -Subscription to the systemd-security mailing list is open to **regular systemd contributors and people working in the security teams of various distributions**. Those conditions should be backed by publicly accessible information (ideally, a track of posts and commits from the mail address in question). If you fall into one of those categories and wish to be subscribed, submit a **[subscription request](https://www.redhat.com/mailman/listinfo/systemd-security)**. +Subscription to the systemd-security mailing list is open to **regular systemd contributors and people working in the security teams of various distributions**. +Those conditions should be backed by publicly accessible information (ideally, a track of posts and commits from the mail address in question). +If you fall into one of those categories and wish to be subscribed, submit a **[subscription request](https://www.redhat.com/mailman/listinfo/systemd-security)**. diff --git a/docs/SEPARATE_USR_IS_BROKEN.md b/docs/SEPARATE_USR_IS_BROKEN.md index 8e9390e..5612945 100644 --- a/docs/SEPARATE_USR_IS_BROKEN.md +++ b/docs/SEPARATE_USR_IS_BROKEN.md @@ -7,34 +7,83 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Booting Without /usr is Broken -You probably discovered this page because your shiny new systemd system referred you here during boot time, when it warned you that booting without /usr pre-mounted wasn't supported anymore. And now you wonder what this all is about. 
Here's an attempt of an explanation: +You probably discovered this page because your shiny new systemd system referred you here during boot time, +when it warned you that booting without `/usr` pre-mounted wasn't supported anymore. +And now you wonder what this all is about. +Here's an attempt of an explanation: -One thing in advance: systemd itself is actually mostly fine with /usr on a separate file system that is not pre-mounted at boot time. However, the common basic set of OS components of modern Linux machines is not, and has not been in quite some time. And it is unlikely that this is going to be fixed any time soon, or even ever. +One thing in advance: +systemd itself is actually mostly fine with `/usr` on a separate file system that is not pre-mounted at boot time. +However, the common basic set of OS components of modern Linux machines is not, and has not been in quite some time. +And it is unlikely that this is going to be fixed any time soon, or even ever. -Most of the failures you will experience with /usr split off and not pre-mounted in the initramfs are graceful failures: they won't become directly visible, however certain features become unavailable due to these failures. Quite a number of programs these days hook themselves into the early boot process at various stages. A popular way to do this is for example via udev rules. The binaries called from these rules are sometimes located on /usr/bin, or link against libraries in /usr/lib, or use data files from /usr/share. If these rules fail udev will proceed with the next one, however later on applications will then not properly detect these udev devices or features of these devices. 
Here's a short, very in-comprehensive list of software we are aware of that currently are not able to provide the full set of functionality when /usr is split off and not pre-mounted at boot: udev-pci-db/udev-usb-db and all rules depending on this (using the PCI/USB database in /usr/share), PulseAudio, NetworkManager, ModemManager, udisks, libatasmart, usb\_modeswitch, gnome-color-manager, usbmuxd, ALSA, D-Bus, CUPS, Plymouth, LVM, hplip, multipath, Argyll, VMWare, the locale logic of most programs and a lot of other stuff. +Most of the failures you will experience with `/usr` split off and not pre-mounted in the initramfs are graceful failures: +they won't become directly visible, however certain features become unavailable due to these failures. +Quite a number of programs these days hook themselves into the early boot process at various stages. +A popular way to do this is for example via udev rules. +The binaries called from these rules are sometimes located on `/usr/bin`, or link against libraries in `/usr/lib`, +or use data files from `/usr/share`. +If these rules fail udev will proceed with the next one, +however later on applications will then not properly detect these udev devices or features of these devices. +Here's a short, very in-comprehensive list of software we are aware of that currently are not able to provide the full set of functionality when `/usr` is split off and not pre-mounted at boot: +udev-pci-db/udev-usb-db and all rules depending on this +(using the PCI/USB database in `/usr/share`), +PulseAudio, NetworkManager, ModemManager, udisks, libatasmart, usb\_modeswitch, +gnome-color-manager, usbmuxd, ALSA, D-Bus, CUPS, Plymouth, LVM, hplip, multipath, Argyll, VMWare, +the locale logic of most programs and a lot of other stuff. -You don't believe us? 
Well, here's a command line that reveals a few obvious cases of udev rules that will silently fail to work if /usr is split off and not pre-mounted: `egrep 'usb-db|pci-db|FROM_DATABASE|/usr' /*/udev/rules.d/*` -- and you find a lot more if you actually look for it. On my fresh Fedora 15 install that's 23 obvious cases. +You don't believe us? +Well, here's a command line that reveals a few obvious cases of udev rules that will silently fail to work if `/usr` is split off and not pre-mounted: +`egrep 'usb-db|pci-db|FROM_DATABASE|/usr' /*/udev/rules.d/*` +-- and you find a lot more if you actually look for it. +On my fresh Fedora 15 install that's 23 obvious cases. ## The Status Quo -Due to this, many upstream developers have decided to consider the problem of a separate /usr that is not mounted during early boot an outdated question, and started to close bugs regarding these issues as WONTFIX. We certainly cannot blame them, as the benefit of supporting this is questionable and brings a lot of additional work with it. +Due to this, many upstream developers have decided to consider the problem of a separate +`/usr` that is not mounted during early boot an outdated question, +and started to close bugs regarding these issues as WONTFIX. +We certainly cannot blame them, as the benefit of supporting this is questionable and brings a lot of additional work with it. And let's clarify a few things: -1. **It isn't systemd's fault.** systemd mostly works fine with /usr on a separate file system that is not pre-mounted at boot. +1. **It isn't systemd's fault.** systemd mostly works fine with `/usr` on a separate file system that is not pre-mounted at boot. 2. **systemd is merely the messenger.** Don't shoot the messenger. -3. **There's no news in all of this.** The message you saw is just a statement of fact, describing the status quo. Things have been this way since a while. +3. 
**There's no news in all of this.** The message you saw is just a statement of fact, describing the status quo. + Things have been this way for a while. 4. **The message is merely a warning.** You can choose to ignore it. -5. **Don't blame us**, don't abuse us, it's not our fault. We have been working on the Linux userspace since quite some time, and simply have enough of the constant bug reports regarding these issues, since they are actually very hard to track down because the failures are mostly graceful. Hence we placed this warning into the early boot process of every systemd Linux system with a split off and not pre-mounted /usr, so that people understand what is going on. +5. **Don't blame us**, don't abuse us, it's not our fault. +We have been working on the Linux userspace for quite some time, +and have simply had enough of the constant bug reports regarding these issues, +since they are actually very hard to track down because the failures are mostly graceful. +Hence we placed this warning into the early boot process of every systemd Linux system with a split off and not pre-mounted +`/usr`, so that people understand what is going on. ## Going Forward -/usr on its own filesystem is useful in some custom setups. But instead of expecting the traditional Unix way to (sometimes mindlessly) distributing tools between /usr and /, and require more and more tools to move to /, we now just expect /usr to be pre-mounted from inside the initramfs, to be available before 'init' starts. The duty of the minimal boot system that consisted of /bin, /sbin and /lib on traditional Unix, has been taken over by the initramfs of modern Linux. An initramfs that supports mounting /usr on top of / before it starts 'init', makes all existing setups work properly. +`/usr` on its own filesystem is useful in some custom setups.
+But instead of expecting the traditional Unix way to (sometimes mindlessly) distributing tools between `/usr` and `/`, +and require more and more tools to move to `/`, +we now just expect `/usr` to be pre-mounted from inside the initramfs, to be available before 'init' starts. +The duty of the minimal boot system that consisted of `/bin`, `/sbin` and `/lib` on traditional Unix, +has been taken over by the initramfs of modern Linux. +An initramfs that supports mounting `/usr` on top of `/` before it starts 'init', makes all existing setups work properly. -There is no way to reliably bring up a modern system with an empty /usr. There are two alternatives to fix it: move /usr back to the rootfs or use an initramfs which can hide the split-off from the system. +There is no way to reliably bring up a modern system with an empty `/usr`. +There are two alternatives to fix it: move `/usr` back to the rootfs or use an initramfs which can hide the split-off from the system. -On the Fedora distribution we have succeeded to clean up the situation and the confusion the current split between / and /usr has created. We have moved all tools that over time have been moved to / back to /usr (where they belong), and the root file system only contains compatibility symlinks for /bin and /sbin into /usr. All binaries of the system are exclusively located within the /usr hierarchy. +On the Fedora distribution we have succeeded to clean up the situation and the confusion the current split between `/` and `/usr` has created. +We have moved all tools that over time have been moved to `/` back to `/usr` (where they belong), +and the root file system only contains compatibility symlinks for `/bin` and `/sbin` into `/usr`. +All binaries of the system are exclusively located within the `/usr` hierarchy. 
-In this new definition of /usr, the directory can be mounted read-only by default, while the rootfs may be either read-write or read-only (for stateless systems) and contains only the empty mount point directories, compat-symlinks to /usr and the host-specific data like /etc, /root, /srv. In comparison to today's setups, the rootfs will be very small. The host-specific data will be properly separated from the installed operating system. The new /usr could also easily be shared read-only across several systems. Such a setup would be more efficient, can provide additional security, is more flexible to use, provides saner options for custom setups, and is much simpler to setup and maintain. +In this new definition of `/usr`, the directory can be mounted read-only by default, +while the rootfs may be either read-write or read-only (for stateless systems) and contains only the empty mount point directories, +compat-symlinks to `/usr` and the host-specific data like `/etc`, `/root`, `/srv`. +In comparison to today's setups, the rootfs will be very small. +The host-specific data will be properly separated from the installed operating system. +The new `/usr` could also easily be shared read-only across several systems. +Such a setup would be more efficient, can provide additional security, is more flexible to use, +provides saner options for custom setups, and is much simpler to set up and maintain. -For more information on this please continue to [The Case for the /usr Merge](../THE_CASE_FOR_THE_USR_MERGE). +For more information on this please continue to [The Case for the /usr Merge](/THE_CASE_FOR_THE_USR_MERGE).
diff --git a/docs/SYSLOG.md b/docs/SYSLOG.md index 35c6225..9999a8d 100644 --- a/docs/SYSLOG.md +++ b/docs/SYSLOG.md @@ -7,28 +7,56 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Writing syslog Daemons Which Cooperate Nicely With systemd -Here are a few notes on things to keep in mind when you work on a classic BSD syslog daemon for Linux, to ensure that your syslog daemon works nicely together with systemd. If your syslog implementation does not follow these rules, then it will not be compatible with systemd v38 and newer. +Here are a few notes on things to keep in mind when you work on a classic BSD syslog daemon for Linux, to ensure that your syslog daemon works nicely together with systemd. +If your syslog implementation does not follow these rules, then it will not be compatible with systemd v38 and newer. -A few notes in advance: systemd centralizes all log streams in the Journal daemon. Messages coming in via /dev/log, via the native protocol, via STDOUT/STDERR of all services and via the kernel are received in the journal daemon. The journal daemon then stores them to disk or in RAM (depending on the configuration of the Storage= option in journald.conf), and optionally forwards them to the console, the kernel log buffer, or to a classic BSD syslog daemon -- and that's where you come in. +A few notes in advance: systemd centralizes all log streams in the Journal daemon. +Messages coming in via /dev/log, via the native protocol, via STDOUT/STDERR of all services and via the kernel are received in the journal daemon. -Note that it is now the journal that listens on /dev/log, no longer the BSD syslog daemon directly. If your logging daemon wants to get access to all logging data then it should listen on /run/systemd/journal/syslog instead via the syslog.socket unit file that is shipped along with systemd. On a systemd system it is no longer OK to listen on /dev/log directly, and your daemon may not bind to the /run/systemd/journal/syslog socket on its own. 
If you do that then you will lose logging from STDOUT/STDERR of services (as well as other stuff). +The journal daemon then stores them to disk or in RAM (depending on the configuration of the Storage= option in journald.conf), and optionally forwards them to the console, the kernel log buffer, or to a classic BSD syslog daemon -- and that's where you come in. -Your BSD compatible logging service should alias `syslog.service` to itself (i.e. symlink) when it is _enabled_. That way [syslog.socket](http://cgit.freedesktop.org/systemd/systemd/plain/units/syslog.socket) will activate your service when things are logged. Of course, only one implementation of BSD syslog can own that symlink, and hence only one implementation can be enabled at a time, but that's intended as there can only be one process listening on that socket. (see below for details how to manage this symlink.) Note that this means that syslog.socket as shipped with systemd is _shared_ among all implementations, and the implementation that is in control is configured with where syslog.service points to. +Note that it is now the journal that listens on /dev/log, no longer the BSD syslog daemon directly. +If your logging daemon wants to get access to all logging data then it should listen on /run/systemd/journal/syslog instead via the syslog.socket unit file that is shipped along with systemd. +On a systemd system it is no longer OK to listen on /dev/log directly, and your daemon may not bind to the /run/systemd/journal/syslog socket on its own. +If you do that then you will lose logging from STDOUT/STDERR of services (as well as other stuff). -Note that journald tries hard to forward to your BSD syslog daemon as much as it can. That means you will get more than you traditionally got on /dev/log, such as stuff all daemons log on STDOUT/STDERR and the messages that are logged natively to systemd. Also, we will send stuff like the original SCM_CREDENTIALS along if possible. 
+Your BSD compatible logging service should alias `syslog.service` to itself (i.e. symlink) when it is _enabled_. +That way [syslog.socket](http://cgit.freedesktop.org/systemd/systemd/plain/units/syslog.socket) will activate your service when things are logged. +Of course, only one implementation of BSD syslog can own that symlink, and hence only one implementation can be enabled at a time, but that's intended as there can only be one process listening on that socket. +(see below for details how to manage this symlink.) -(BTW, journald is smart enough not to forward the kernel messages it gets to you, you should read that on your own, directly from /proc/kmsg, as you always did. It's also smart enough never to forward kernel messages back to the kernel, but that probably shouldn't concern you too much...) +Note that this means that syslog.socket as shipped with systemd is _shared_ among all implementations, and the implementation that is in control is configured with where syslog.service points to. + +Note that journald tries hard to forward to your BSD syslog daemon as much as it can. +That means you will get more than you traditionally got on /dev/log, such as stuff all daemons log on STDOUT/STDERR and the messages that are logged natively to systemd. Also, we will send stuff like the original SCM_CREDENTIALS along if possible. + +(BTW, journald is smart enough not to forward the kernel messages it gets to you, you should read that on your own, directly from /proc/kmsg, as you always did. +It's also smart enough never to forward kernel messages back to the kernel, but that probably shouldn't concern you too much...) And here are the recommendations: -- First of all, make sure your syslog daemon installs a native service unit file (SysV scripts are not sufficient!) and is socket activatable. Newer systemd versions (v35+) do not support non-socket-activated syslog daemons anymore and we do no longer recommend people to order their units after syslog.target. 
That means that unless your syslog implementation is socket activatable many services will not be able to log to your syslog implementation and early boot messages are lost entirely to your implementation. Note that your service should install only one unit file, and nothing else. Do not install socket unit files. -- Make sure that in your unit file you set StandardOutput=null in the [Service] block. This makes sure that regardless what the global default for StandardOutput= is the output of your syslog implementation goes to /dev/null. This matters since the default StandardOutput= value for all units can be set to syslog and this should not create a feedback loop with your implementation where the messages your syslog implementation writes out are fed back to it. In other words: you need to explicitly opt out of the default standard output redirection we do for other services. (Also note that you do not need to set StandardError= explicitly, since that inherits the setting of StandardOutput= by default) +- First of all, make sure your syslog daemon installs a native service unit file (SysV scripts are not sufficient!) and is socket activatable. Newer systemd versions (v35+) do not support non-socket-activated syslog daemons anymore and we do no longer recommend people to order their units after syslog.target. +That means that unless your syslog implementation is socket activatable many services will not be able to log to your syslog implementation and early boot messages are lost entirely to your implementation. +Note that your service should install only one unit file, and nothing else. Do not install socket unit files. + +- Make sure that in your unit file you set StandardOutput=null in the [Service] block. +This makes sure that regardless what the global default for StandardOutput= is the output of your syslog implementation goes to /dev/null. 
+This matters since the default StandardOutput= value for all units can be set to syslog and this should not create a feedback loop with your implementation where the messages your syslog implementation writes out are fed back to it. +In other words: you need to explicitly opt out of the default standard output redirection we do for other services. +(Also note that you do not need to set StandardError= explicitly, since that inherits the setting of StandardOutput= by default) + - /proc/kmsg is your property, flush it to disk as soon as you start up. -- Name your service unit after your daemon (e.g. rsyslog.service or syslog-ng.service) and make sure to include Alias=syslog.service in your [Install] section in the unit file. This is ensures that the symlink syslog.service is created if your service is enabled and that it points to your service. Also add WantedBy=multi-user.target so that your service gets started at boot, and add Requires=syslog.socket in [Unit] so that you pull in the socket unit. + +- Name your service unit after your daemon (e.g. rsyslog.service or syslog-ng.service) and make sure to include Alias=syslog.service in your [Install] section in the unit file. +This ensures that the symlink syslog.service is created if your service is enabled and that it points to your service. +Also add WantedBy=multi-user.target so that your service gets started at boot, and add Requires=syslog.socket in [Unit] so that you pull in the socket unit. Here are a few other recommendations, that are not directly related to systemd: -- Make sure to read the priority prefixes of the kmsg log messages the same way like from normal userspace syslog messages. When systemd writes to kmsg it will prefix all messages with valid priorities which include standard syslog facility values. OTOH for kernel messages the facility is always 0.
If you need to know whether a message originated in the kernel rely on the facility value, not just on the fact that you read the message from /proc/kmsg! A number of userspace applications write messages to kmsg (systemd, udev, dracut, others), and they'll nowadays all set correct facility values. +- Make sure to read the priority prefixes of the kmsg log messages the same way like from normal userspace syslog messages. +When systemd writes to kmsg it will prefix all messages with valid priorities which include standard syslog facility values. OTOH for kernel messages the facility is always 0. +If you need to know whether a message originated in the kernel rely on the facility value, not just on the fact that you read the message from /proc/kmsg! A number of userspace applications write messages to kmsg (systemd, udev, dracut, others), and they'll nowadays all set correct facility values. + - When you read a message from the socket use SCM_CREDENTIALS to get information about the client generating it, and possibly patch the message with this data in order to make it impossible for clients to fake identities. The unit file you install for your service should look something like this: diff --git a/docs/SYSTEMD_FILE_HIERARCHY_REQUIREMENTS.md b/docs/SYSTEMD_FILE_HIERARCHY_REQUIREMENTS.md index 574df93..81d6f43 100644 --- a/docs/SYSTEMD_FILE_HIERARCHY_REQUIREMENTS.md +++ b/docs/SYSTEMD_FILE_HIERARCHY_REQUIREMENTS.md @@ -7,14 +7,26 @@ SPDX-License-Identifier: LGPL-2.1-or-later # systemd File Hierarchy Requirements -There are various attempts to standardize the file system hierarchy of Linux systems. In systemd we leave much of the file system layout open to the operating system, but here's what systemd strictly requires: +There are various attempts to standardize the file system hierarchy of Linux systems. 
+In systemd we leave much of the file system layout open to the operating system, but here's what systemd strictly requires: -- /, /usr, /etc must be mounted when the host systemd is first invoked. This may be achieved either by using the kernel's built-in root disk mounting (in which case /, /usr and /etc need to be on the same file system), or via an initrd, which could mount the three directories from different sources. -- /bin, /sbin, /lib (and /lib64 if applicable) should reside on /, or be symlinks to the /usr file system (recommended). All of them must be available before the host systemd is first executed. -- /var does not have to be mounted when the host systemd is first invoked, however, it must be configured so that it is mounted writable before local-fs.target is reached (for example, by simply listing it in /etc/fstab). -- /tmp is recommended to be a tmpfs (default), but doesn't have to. If configured, it must be mounted before local-fs.target is reached (for example, by listing it in /etc/fstab). -- /dev must exist as an empty mount point and will automatically be mounted by systemd with a devtmpfs. Non-devtmpfs boots are not supported. -- /proc and /sys must exist as empty mount points and will automatically be mounted by systemd with procfs and sysfs. -- /run must exist as an empty mount point and will automatically be mounted by systemd with a tmpfs. +- `/`, `/usr`, `/etc` must be mounted when the host systemd is first invoked. + This may be achieved either by using the kernel's built-in root disk mounting (in which case `/`, `/usr` and `/etc` need to be on the same file system), or via an initrd, which could mount the three directories from different sources. -The other directories usually found in the root directory (such as /home, /boot, /opt) are irrelevant to systemd. If they are defined they may be mounted from any source and at any time, though it is a good idea to mount them also before local-fs.target is reached. 
+- `/bin`, `/sbin`, `/lib` (and `/lib64` if applicable) should reside on `/`, or be symlinks to the `/usr` file system (recommended). + All of them must be available before the host systemd is first executed. + +- `/var` does not have to be mounted when the host systemd is first invoked, however, + it must be configured so that it is mounted writable before local-fs.target is reached (for example, by simply listing it in `/etc/fstab`). + +- `/tmp` is recommended to be a tmpfs (default), but doesn't have to. + If configured, it must be mounted before local-fs.target is reached (for example, by listing it in `/etc/fstab`). + +- `/dev` must exist as an empty mount point and will automatically be mounted by systemd with a devtmpfs. Non-devtmpfs boots are not supported. + +- `/proc` and `/sys` must exist as empty mount points and will automatically be mounted by systemd with procfs and sysfs. + +- `/run` must exist as an empty mount point and will automatically be mounted by systemd with a tmpfs. + +The other directories usually found in the root directory (such as `/home`, `/boot`, `/opt`) are irrelevant to systemd. +If they are defined they may be mounted from any source and at any time, though it is a good idea to mount them also before local-fs.target is reached. diff --git a/docs/THE_CASE_FOR_THE_USR_MERGE.md b/docs/THE_CASE_FOR_THE_USR_MERGE.md index 2cdb6db..c603e14 100644 --- a/docs/THE_CASE_FOR_THE_USR_MERGE.md +++ b/docs/THE_CASE_FOR_THE_USR_MERGE.md @@ -104,7 +104,7 @@ _With all vendor-supplied OS resources in a single directory /usr they may be sh **Myth #10**: The status quo of a split /usr with mounting it without initrd is perfectly well supported right now and works. -**Fact**: A split /usr without involvement of an initrd mounting it before jumping into the root file system [hasn't worked correctly since a long time](http://freedesktop.org/wiki/Software/systemd/separate-usr-is-broken).
+**Fact**: A split /usr without involvement of an initrd mounting it before jumping into the root file system [hasn't worked correctly since a long time](/SEPARATE_USR_IS_BROKEN). **Myth #11**: Instead of merging / into /usr it would make a lot more sense to merge /usr into /. diff --git a/docs/TIPS_AND_TRICKS.md b/docs/TIPS_AND_TRICKS.md index f181f12..208f444 100644 --- a/docs/TIPS_AND_TRICKS.md +++ b/docs/TIPS_AND_TRICKS.md @@ -7,7 +7,7 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Tips & Tricks -Also check out the [Frequently Asked Questions](http://www.freedesktop.org/wiki/Software/systemd/FrequentlyAskedQuestions)! +Also check out the [Frequently Asked Questions](/FAQ)! ## Listing running services @@ -155,13 +155,16 @@ $ psc $ ln -sf /usr/lib/systemd/system/multi-user.target /etc/systemd/system/default.target ``` -This line makes the multi user target (i.e. full system, but no graphical UI) the default target to boot into. This is kinda equivalent to setting runlevel 3 as the default runlevel on Fedora/sysvinit systems. +This line makes the multi user target (i.e. full system, but no graphical UI) the default target to boot into. +This is kinda equivalent to setting runlevel 3 as the default runlevel on Fedora/sysvinit systems. ```sh $ ln -sf /usr/lib/systemd/system/graphical.target /etc/systemd/system/default.target ``` -This line makes the graphical target (i.e. full system, including graphical UI) the default target to boot into. Kinda equivalent to runlevel 5 on fedora/sysvinit systems. This is how things are shipped by default. +This line makes the graphical target (i.e. full system, including graphical UI) the default target to boot into. +Kinda equivalent to runlevel 5 on fedora/sysvinit systems. +This is how things are shipped by default. ## What other units does a unit depend on? 
@@ -172,7 +175,8 @@ $ systemctl show -p "Wants" multi-user.target Wants=rc-local.service avahi-daemon.service rpcbind.service NetworkManager.service acpid.service dbus.service atd.service crond.service auditd.service ntpd.service udisks.service bluetooth.service cups.service wpa_supplicant.service getty.target modem-manager.service portreserve.service abrtd.service yum-updatesd.service upowerd.service test-first.service pcscd.service rsyslog.service haldaemon.service remote-fs.target plymouth-quit.service systemd-update-utmp-runlevel.service sendmail.service lvm2-monitor.service cpuspeed.service udev-post.service mdmonitor.service iscsid.service livesys.service livesys-late.service irqbalance.service iscsi.service netfs.service ``` -Instead of "Wants" you might also try "WantedBy", "Requires", "RequiredBy", "Conflicts", "ConflictedBy", "Before", "After" for the respective types of dependencies and their inverse. +Instead of "Wants" you might also try "WantedBy", "Requires", "RequiredBy", "Conflicts", "ConflictedBy", "Before", "After" +for the respective types of dependencies and their inverse. ## What would get started if I booted into a specific target? @@ -182,4 +186,6 @@ If you want systemd to calculate the "initial" transaction it would execute on b $ systemd --test --system --unit=foobar.target ``` -for a boot target foobar.target. Note that this is mostly a debugging tool that actually does a lot more than just calculate the initial transaction, so don't build scripts based on this. +for a boot target foobar.target. +Note that this is mostly a debugging tool that actually does a lot more than just calculate the initial transaction, +so don't build scripts based on this. 
diff --git a/docs/UIDS-GIDS.md b/docs/UIDS-GIDS.md index e84f037..4b28d95 100644 --- a/docs/UIDS-GIDS.md +++ b/docs/UIDS-GIDS.md @@ -10,11 +10,10 @@ SPDX-License-Identifier: LGPL-2.1-or-later Here's a summary of the requirements `systemd` (and Linux) make on UID/GID assignments and their ranges. -Note that while in theory UIDs and GIDs are orthogonal concepts they really -aren't IRL. With that in mind, when we discuss UIDs below it should be assumed -that whatever we say about UIDs applies to GIDs in mostly the same way, and all -the special assignments and ranges for UIDs always have mostly the same -validity for GIDs too. +Note that while in theory UIDs and GIDs are orthogonal concepts they really aren't IRL. +With that in mind, when we discuss UIDs below it should be assumed +that whatever we say about UIDs applies to GIDs in mostly the same way, +and all the special assignments and ranges for UIDs always have mostly the same validity for GIDs too. ## Special Linux UIDs @@ -23,28 +22,28 @@ i.e. 0…4294967295. However, four UIDs are special on Linux: 1. 0 → The `root` super-user. -2. 65534 → The `nobody` UID, also called the "overflow" UID or similar. It's - where various subsystems map unmappable users to, for example file systems - only supporting 16-bit UIDs, NFS or user namespacing. (The latter can be - changed with a sysctl during runtime, but that's not supported on - `systemd`. If you do change it you void your warranty.) Because Fedora is a - bit confused the `nobody` user is called `nfsnobody` there (and they have a - different `nobody` user at UID 99). I hope this will be corrected eventually - though. (Also, some distributions call the `nobody` group `nogroup`. I wish - they didn't.) +2. 65534 → The `nobody` UID, also called the "overflow" UID or similar. + It's where various subsystems map unmappable users to, for example file systems + only supporting 16-bit UIDs, NFS or user namespacing. 
+ (The latter can be changed with a sysctl during runtime, but that's not supported on + `systemd`. If you do change it you void your warranty.) + Because Fedora is a bit confused the `nobody` user is called `nfsnobody` there + (and they have a different `nobody` user at UID 99). + I hope this will be corrected eventually though. + (Also, some distributions call the `nobody` group `nogroup`. I wish they didn't.) 3. 4294967295, aka "32-bit `(uid_t) -1`" → This UID is not a valid user ID, as `setresuid()`, `chown()` and friends treat -1 as a special request to not - change the UID of the process/file. This UID is hence not available for - assignment to users in the user database. + change the UID of the process/file. + This UID is hence not available for assignment to users in the user database. 4. 65535, aka "16-bit `(uid_t) -1`" → Before Linux kernel 2.4 `uid_t` used to be 16-bit, and programs compiled for that would hence assume that `(uid_t) -1` is 65535. This UID is hence not usable either. The `nss-systemd` glibc NSS module will synthesize user database records for -the UIDs 0 and 65534 if the system user database doesn't list them. This means -that any system where this module is enabled works to some minimal level +the UIDs 0 and 65534 if the system user database doesn't list them. +This means that any system where this module is enabled works to some minimal level without `/etc/passwd`. ## Special Distribution UID ranges @@ -58,8 +57,7 @@ Distributions generally split the available UID range in two: 2. 1000…65533 and 65536…4294967294 → Everything else, i.e. regular (human) users. Some older systems placed the boundary at 499/500, or even 99/100, -and some distributions allow the boundary between system and regular users to be changed -via local configuration. +and some distributions allow the boundary between system and regular users to be changed via local configuration. 
In `systemd`, the boundary is configurable during compilation time and is also queried from `/etc/login.defs` at runtime, if the `-Dcompat-mutable-uid-boundaries=true` compile-time setting is used. @@ -70,91 +68,88 @@ available regular user range only, usually 1000..60000. This range can also be configured using `/etc/login.defs`. Note that systemd requires that system users and groups are resolvable without -network — a requirement that is not made for regular users. This -means regular users may be stored in remote LDAP or NIS databases, but system -users may not (except when there's a consistent local cache kept, that is +network — a requirement that is not made for regular users. +This means regular users may be stored in remote LDAP or NIS databases, +but system users may not (except when there's a consistent local cache kept, that is available during earliest boot, including in the initrd). ## Special `systemd` GIDs -`systemd` defines no special UIDs beyond what Linux already defines (see -above). However, it does define some special group/GID assignments, which are -primarily used for `systemd-udevd`'s device management. The precise list of the -currently defined groups is found in this `sysusers.d` snippet: +`systemd` defines no special UIDs beyond what Linux already defines (see above). +However, it does define some special group/GID assignments, +which are primarily used for `systemd-udevd`'s device management. +The precise list of the currently defined groups is found in this `sysusers.d` snippet: [basic.conf](https://raw.githubusercontent.com/systemd/systemd/main/sysusers.d/basic.conf.in) It's strongly recommended that downstream distributions include these groups in their default group databases. Note that the actual GID numbers assigned to these groups do not have to be -constant beyond a specific system. There's one exception however: the `tty` -group must have the GID 5. 
That's because it must be encoded in the `devpts` -mount parameters during earliest boot, at a time where NSS lookups are not -possible. (Note that the actual GID can be changed during `systemd` build time, -but downstreams are strongly advised against doing that.) +constant beyond a specific system. +There's one exception however: the `tty` group must have the GID 5. +That's because it must be encoded in the `devpts` mount parameters during earliest boot, at a time where NSS lookups are not +possible. +(Note that the actual GID can be changed during `systemd` build time, but downstreams are strongly advised against doing that.) ## Special `systemd` UID ranges `systemd` defines a number of special UID ranges: 1. 60001…60513 → UIDs for home directories managed by - [`systemd-homed.service(8)`](https://www.freedesktop.org/software/systemd/man/systemd-homed.service.html). UIDs - from this range are automatically assigned to any home directory discovered, - and persisted locally on first login. On different systems the same user - might get different UIDs assigned in case of conflict, though it is + [`systemd-homed.service(8)`](https://www.freedesktop.org/software/systemd/man/systemd-homed.service.html). + UIDs from this range are automatically assigned to any home directory discovered, + and persisted locally on first login. + On different systems the same user might get different UIDs assigned in case of conflict, though it is attempted to make UID assignments stable, by deriving them from a hash of the user name. 2. 61184…65519 → UIDs for dynamic users are allocated from this range (see the `DynamicUser=` documentation in - [`systemd.exec(5)`](https://www.freedesktop.org/software/systemd/man/systemd.exec.html)). This - range has been chosen so that it is below the 16-bit boundary (i.e. below - 65535), in order to provide compatibility with container environments that - assign a 64K range of UIDs to containers using user namespacing. 
This range - is above the 60000 boundary, so that its allocations are unlikely to be - affected by `adduser` allocations (see above). And we leave some room - upwards for other purposes. (And if you wonder why precisely these numbers: - if you write them in hexadecimal, they might make more sense: 0xEF00 and - 0xFFEF). The `nss-systemd` module will synthesize user records implicitly - for all currently allocated dynamic users from this range. Thus, NSS-based - user record resolving works correctly without those users being in - `/etc/passwd`. + [`systemd.exec(5)`](https://www.freedesktop.org/software/systemd/man/systemd.exec.html)). + This range has been chosen so that it is below the 16-bit boundary + (i.e. below 65535), in order to provide compatibility with container environments that + assign a 64K range of UIDs to containers using user namespacing. + This range is above the 60000 boundary, so that its allocations are unlikely to be + affected by `adduser` allocations (see above). + And we leave some room upwards for other purposes. + (And if you wonder why precisely these numbers: if you write them in hexadecimal, they might make more sense: 0xEF00 and 0xFFEF). + The `nss-systemd` module will synthesize user records implicitly + for all currently allocated dynamic users from this range. + Thus, NSS-based user record resolving works correctly without those users being in `/etc/passwd`. 3. 524288…1879048191 → UID range for `systemd-nspawn`'s automatic allocation of - per-container UID ranges. When the `--private-users=pick` switch is used (or - `-U`) then it will automatically find a so far unused 16-bit subrange of this - range and assign it to the container. The range is picked so that the upper - 16-bit of the 32-bit UIDs are constant for all users of the container, while - the lower 16-bit directly encode the 65536 UIDs assigned to the - container. This mode of allocation means that the upper 16-bit of any UID + per-container UID ranges. 
+ When the `--private-users=pick` switch is used (or `-U`) then it will automatically find a so far unused 16-bit subrange of this + range and assign it to the container. + The range is picked so that the upper 16-bit of the 32-bit UIDs are constant for all users of the container, + while the lower 16-bit directly encode the 65536 UIDs assigned to the container. + This mode of allocation means that the upper 16-bit of any UID assigned to a container are kind of a "container ID", while the lower 16-bit - directly expose the container's own UID numbers. If you wonder why precisely - these numbers, consider them in hexadecimal: 0x00080000…0x6FFFFFFF. This - range is above the 16-bit boundary. Moreover it's below the 31-bit boundary, - as some broken code (specifically: the kernel's `devpts` file system) - erroneously considers UIDs signed integers, and hence can't deal with values - above 2^31. The `systemd-machined.service` service will synthesize user - database records for all UIDs assigned to a running container from this - range. + directly expose the container's own UID numbers. + If you wonder why precisely these numbers, consider them in hexadecimal: 0x00080000…0x6FFFFFFF. + This range is above the 16-bit boundary. + Moreover it's below the 31-bit boundary, as some broken code (specifically: the kernel's `devpts` file system) + erroneously considers UIDs signed integers, and hence can't deal with values above 2^31. + The `systemd-machined.service` service will synthesize user database records for all UIDs assigned to a running container from this range. Note for both allocation ranges: when a UID allocation takes place NSS is -checked for collisions first, and a different UID is picked if an entry is -found. Thus, the user database is used as synchronization mechanism to ensure -exclusive ownership of UIDs and UID ranges. 
To ensure compatibility with other -subsystems allocating from the same ranges it is hence essential that they +checked for collisions first, and a different UID is picked if an entry is found. +Thus, the user database is used as synchronization mechanism to ensure +exclusive ownership of UIDs and UID ranges. +To ensure compatibility with other subsystems allocating from the same ranges it is hence essential that they ensure that whatever they pick shows up in the user/group databases, either by -providing an NSS module, or by adding entries directly to `/etc/passwd` and -`/etc/group`. For performance reasons, do note that `systemd-nspawn` will only -do an NSS check for the first UID of the range it allocates, not all 65536 of -them. Also note that while the allocation logic is operating, the glibc -`lckpwdf()` user database lock is taken, in order to make this logic race-free. +providing an NSS module, or by adding entries directly to `/etc/passwd` and `/etc/group`. +For performance reasons, do note that `systemd-nspawn` will only +do an NSS check for the first UID of the range it allocates, not all 65536 of them. +Also note that while the allocation logic is operating, +the glibc `lckpwdf()` user database lock is taken, in order to make this logic race-free. ## Figuring out the system's UID boundaries The most important boundaries of the local system may be queried with `pkg-config`: -``` +```sh $ pkg-config --variable=system_uid_max systemd 999 $ pkg-config --variable=dynamic_uid_min systemd @@ -172,9 +167,9 @@ pick — given that 64K UIDs are assigned to each container according to this allocation logic, the maximum UID used for this range is hence 1878982656+65535=1879048191.) -Systemd has compile-time default for these boundaries. Using those defaults is -recommended. It will nevertheless query `/etc/login.defs` at runtime, when -compiled with `-Dcompat-mutable-uid-boundaries=true` and that file is present. 
+Systemd has compile-time defaults for these boundaries. +Using those defaults is recommended. +It will nevertheless query `/etc/login.defs` at runtime, when compiled with `-Dcompat-mutable-uid-boundaries=true` and that file is present. Support for this is considered only a compatibility feature and should not be used except when upgrading systems which were created with different defaults. @@ -183,75 +178,71 @@ used except when upgrading systems which were created with different defaults. If you hack on a container manager, and wonder how and how many UIDs best to assign to your containers, here are a few recommendations: -1. Definitely, don't assign less than 65536 UIDs/GIDs. After all the `nobody` -user has magic properties, and hence should be available in your container, and -given that it's assigned the UID 65534, you should really cover the full 16-bit -range in your container. Note that systemd will — as mentioned — synthesize -user records for the `nobody` user, and assumes its availability in various -other parts of its codebase, too, hence assigning fewer users means you lose -compatibility with running systemd code inside your container. And most likely -other packages make similar restrictions. +1. Definitely, don't assign less than 65536 UIDs/GIDs. +After all the `nobody` user has magic properties, and hence should be available in your container, +and given that it's assigned the UID 65534, you should really cover the full 16-bit range in your container. +Note that systemd will — as mentioned — synthesize user records for the `nobody` user, +and assumes its availability in various other parts of its codebase, too, hence assigning fewer users means you lose +compatibility with running systemd code inside your container. +And most likely other packages make similar restrictions. 2.
While it's fine to assign more than 65536 UIDs/GIDs to a container, there's most likely not much value in doing so, as Linux distributions won't use the higher ranges by default (as mentioned neither `adduser` nor `systemd`'s -dynamic user concept allocate from above the 16-bit range). Unless you actively -care for nested containers, it's hence probably a good idea to allocate exactly -65536 UIDs per container, and neither less nor more. A pretty side-effect is -that by doing so, you expose the same number of UIDs per container as Linux 2.2 +dynamic user concept allocate from above the 16-bit range). +Unless you actively care for nested containers, it's hence probably a good idea to allocate exactly +65536 UIDs per container, and neither less nor more. +A pretty side-effect is that by doing so, you expose the same number of UIDs per container as Linux 2.2 supported for the whole system, back in the days. -3. Consider allocating UID ranges for containers so that the first UID you -assign has the lower 16-bits all set to zero. That way, the upper 16-bits become -a container ID of some kind, while the lower 16-bits directly encode the -internal container UID. This is the way `systemd-nspawn` allocates UID ranges -(see above). Following this allocation logic ensures best compatibility with -`systemd-nspawn` and all other container managers following the scheme, as it -is sufficient then to check NSS for the first UID you pick regarding conflicts, -as that's what they do, too. Moreover, it makes `chown()`ing container file -system trees nicely robust to interruptions: as the external UID encodes the +3. Consider allocating UID ranges for containers so that the first UID you assign has the lower 16-bits all set to zero. +That way, the upper 16-bits become a container ID of some kind, +while the lower 16-bits directly encode the internal container UID. +This is the way `systemd-nspawn` allocates UID ranges (see above).
+Following this allocation logic ensures best compatibility with `systemd-nspawn` +and all other container managers following the scheme, as it +is sufficient then to check NSS for the first UID you pick regarding conflicts, as that's what they do, too. +Moreover, it makes `chown()`ing container file system trees nicely robust to interruptions: as the external UID encodes the internal UID in a fixed way, it's very easy to adjust the container's base UID without the need to know the original base UID: to change the container base, -just mask away the upper 16-bit, and insert the upper 16-bit of the new container -base instead. Here are the easy conversions to derive the internal UID, the -external UID, and the container base UID from each other: +just mask away the upper 16-bit, and insert the upper 16-bit of the new container base instead. +Here are the easy conversions to derive the internal UID, the external UID, and the container base UID from each other: - ``` - INTERNAL_UID = EXTERNAL_UID & 0x0000FFFF - CONTAINER_BASE_UID = EXTERNAL_UID & 0xFFFF0000 - EXTERNAL_UID = INTERNAL_UID | CONTAINER_BASE_UID - ``` +```sh +INTERNAL_UID = EXTERNAL_UID & 0x0000FFFF +CONTAINER_BASE_UID = EXTERNAL_UID & 0xFFFF0000 +EXTERNAL_UID = INTERNAL_UID | CONTAINER_BASE_UID +``` 4. When picking a UID range for containers, make sure to check NSS first, with a simple `getpwuid()` call: if there's already a user record for the first UID -you want to pick, then it's already in use: pick a different one. Wrap that -call in a `lckpwdf()` + `ulckpwdf()` pair, to make allocation -race-free. Provide an NSS module that makes all UIDs you end up taking show up +you want to pick, then it's already in use: pick a different one. +Wrap that call in a `lckpwdf()` + `ulckpwdf()` pair, to make allocation race-free. 
+Provide an NSS module that makes all UIDs you end up taking show up in the user database, and make sure that the NSS module returns up-to-date information before you release the lock, so that other system components can -safely use the NSS user database as allocation check, too. Note that if you -follow this scheme no changes to `/etc/passwd` need to be made, thus minimizing +safely use the NSS user database as allocation check, too. +Note that if you follow this scheme no changes to `/etc/passwd` need to be made, thus minimizing the artifacts the container manager persistently leaves in the system. -5. `systemd-homed` by default mounts the home directories it manages with UID -mapping applied. It will map four UID ranges into that uidmap, and leave -everything else unmapped: the range from 0…60000, the user's own UID, the range -60514…65534, and the container range 524288…1879048191. This means -files/directories in home directories managed by `systemd-homed` cannot be +5. `systemd-homed` by default mounts the home directories it manages with UID mapping applied. +It will map four UID ranges into that uidmap, and leave everything else unmapped: +the range from 0…60000, the user's own UID, +the range 60514…65534, and the container range 524288…1879048191. +This means files/directories in home directories managed by `systemd-homed` cannot be owned by UIDs/GIDs outside of these four ranges (attempts to `chown()` files to -UIDs outside of these ranges will fail). Thus, if container trees are to be -placed within a home directory managed by `systemd-homed` they should take -these ranges into consideration and either place the trees at base UID 0 (and -then map them to a higher UID range for use in user namespacing via another -level of UID mapped mounts, at *runtime*) or at a base UID from the container -UID range. 
That said, placing container trees (and in fact any -files/directories not owned by the home directory's user) in home directories +UIDs outside of these ranges will fail). +Thus, if container trees are to be placed within a home directory managed by `systemd-homed` they should take +these ranges into consideration and either place the trees at base UID 0 +(and then map them to a higher UID range for use in user namespacing via another +level of UID mapped mounts, at *runtime*) or at a base UID from the container UID range. +That said, placing container trees (and in fact any files/directories not owned by the home directory's user) in home directories is generally a questionable idea (regardless of whether `systemd-homed` is used or not), given this typically breaks quota assumptions, makes it impossible for users to properly manage all files in their own home directory due to permission problems, introduces security issues around SETUID and severely -restricts compatibility with networked home directories. Typically, it's a much -better idea to place container images outside of the home directory, +restricts compatibility with networked home directories. +Typically, it's a much better idea to place container images outside of the home directory, i.e. somewhere below `/var/` or similar. ## Summary @@ -276,51 +267,47 @@ i.e. somewhere below `/var/` or similar. | 2147483648…4294967294 | HIC SVNT LEONES | | | | 4294967295 | 32-bit `(uid_t) -1` | Linux | | -Note that "Unused" in the table above doesn't mean that these ranges are -really unused. It just means that these ranges have no well-established -pre-defined purposes between Linux, generic low-level distributions and -`systemd`. There might very well be other packages that allocate from these -ranges. +Note that "Unused" in the table above doesn't mean that these ranges are really unused. 
+It just means that these ranges have no well-established +pre-defined purposes between Linux, generic low-level distributions and `systemd`. +There might very well be other packages that allocate from these ranges. -Note that the range 2147483648…4294967294 (i.e. 2^31…2^32-2) should be handled -with care. Various programs (including kernel file systems — see `devpts` — or +Note that the range 2147483648…4294967294 (i.e. 2^31…2^32-2) should be handled with care. +Various programs (including kernel file systems — see `devpts` — or even kernel syscalls – see `setfsuid()`) have trouble with UIDs outside of the -signed 32-bit range, i.e any UIDs equal to or above 2147483648. It is thus -strongly recommended to stay away from this range in order to avoid -complications. This range should be considered reserved for future, special -purposes. +signed 32-bit range, i.e. any UIDs equal to or above 2147483648. +It is thus strongly recommended to stay away from this range in order to avoid complications. +This range should be considered reserved for future, special purposes. ## Notes on resolvability of user and group names User names, UIDs, group names and GIDs don't have to be resolvable using NSS -(i.e. getpwuid() and getpwnam() and friends) all the time. However, systemd -makes the following requirements: +(i.e. getpwuid() and getpwnam() and friends) all the time. +However, systemd makes the following requirements: -System users generally have to be resolvable during early boot already. This -means they should not be provided by any networked service (as those usually +System users generally have to be resolvable during early boot already. +This means they should not be provided by any networked service (as those usually become available during late boot only), except if a local cache is kept that -makes them available during early boot too (i.e. before networking is -up). 
Specifically, system users need to be resolvable at least before -`systemd-udevd.service` and `systemd-tmpfiles-setup.service` are started, as both -need to resolve system users — but note that there might be more services +makes them available during early boot too (i.e. before networking is up). +Specifically, system users need to be resolvable at least before +`systemd-udevd.service` and `systemd-tmpfiles-setup.service` are started, +as both need to resolve system users — but note that there might be more services requiring full resolvability of system users than just these two. Regular users do not need to be resolvable during early boot, it is sufficient -if they become resolvable during late boot. Specifically, regular users need to -be resolvable at the point in time the `nss-user-lookup.target` unit is -reached. This target unit is generally used as synchronization point between -providers of the user database and consumers of it. Services that require that -the user database is fully available (for example, the login service +if they become resolvable during late boot. +Specifically, regular users need to be resolvable at the point in time the `nss-user-lookup.target` unit is reached. +This target unit is generally used as synchronization point between +providers of the user database and consumers of it. +Services that require that the user database is fully available (for example, the login service `systemd-logind.service`) are ordered *after* it, while services that provide -parts of the user database (for example an LDAP user database client) are -ordered *before* it. Note that `nss-user-lookup.target` is a *passive* unit: in +parts of the user database (for example an LDAP user database client) are ordered *before* it. 
+Note that `nss-user-lookup.target` is a *passive* unit: in order to minimize synchronization points on systems that don't need it the unit is pulled into the initial transaction only if there's at least one service that really needs it, and that means only if there's a service providing the -local user database somehow through IPC or suchlike. Or in other words: if you -hack on some networked user database project, then make sure you order your -service `Before=nss-user-lookup.target` and that you pull it in with -`Wants=nss-user-lookup.target`. However, if you hack on some project that needs -the user database to be up in full, then order your service -`After=nss-user-lookup.target`, but do *not* pull it in via a `Wants=` -dependency. +local user database somehow through IPC or suchlike. +Or in other words: if you hack on some networked user database project, then make sure you order your +service `Before=nss-user-lookup.target` and that you pull it in with `Wants=nss-user-lookup.target`. +However, if you hack on some project that needs the user database to be up in full, then order your service +`After=nss-user-lookup.target`, but do *not* pull it in via a `Wants=` dependency. diff --git a/docs/USERDB_AND_DESKTOPS.md b/docs/USERDB_AND_DESKTOPS.md index 3a3da13..b9a9eac 100644 --- a/docs/USERDB_AND_DESKTOPS.md +++ b/docs/USERDB_AND_DESKTOPS.md @@ -9,39 +9,37 @@ SPDX-License-Identifier: LGPL-2.1-or-later Starting with version 245, systemd supports a new subsystem [`systemd-homed.service`](https://www.freedesktop.org/software/systemd/man/systemd-homed.service.html) -for managing regular ("human") users and their home directories. Along with it -a new concept `userdb` got merged that brings rich, extensible JSON user/group -records, extending the classic UNIX/glibc NSS `struct passwd`/`struct group` -structures. Both additions are added in a fully backwards compatible way, -accessible through `getpwnam()`/`getgrnam()`/… (i.e. 
libc NSS) and PAM as +for managing regular ("human") users and their home directories. +Along with it a new concept `userdb` got merged that brings rich, extensible JSON user/group +records, extending the classic UNIX/glibc NSS `struct passwd`/`struct group` structures. +Both additions are added in a fully backwards compatible way, accessible through `getpwnam()`/`getgrnam()`/… (i.e. libc NSS) and PAM as usual, meaning that for basic support no changes in the upper layers of the -stack (in particular desktop environments, such as GNOME or KDE) have to be -made. However, for better support a number of changes to desktop environments -are recommended. A few areas where that applies are discussed below. +stack (in particular desktop environments, such as GNOME or KDE) have to be made. +However, for better support a number of changes to desktop environments are recommended. +A few areas where that applies are discussed below. Before reading on, please read up on the basic concepts, specifically: -* [Home Directories](HOME_DIRECTORY) -* [JSON User Records](USER_RECORD) -* [JSON Group Records](GROUP_RECORD) -* [User/Group Record Lookup API via Varlink](USER_GROUP_API) +* [Home Directories](/HOME_DIRECTORY) +* [JSON User Records](/USER_RECORD) +* [JSON Group Records](/GROUP_RECORD) +* [User/Group Record Lookup API via Varlink](/USER_GROUP_API) ## Support for Suspending Home Directory Access during System Suspend One key feature of `systemd-homed` managed encrypted home directories is the ability that access to them can be suspended automatically during system sleep, -removing any cryptographic key material from memory while doing so. This is -important in a world where most laptop users seldom shut down their computers -but most of the time just suspend them instead. Previously, the encryption keys -for the home directories remained in memory during system suspend, so that -sufficiently equipped attackers could read them from there and gain full access -to the device. 
By removing the key material from memory before suspend, and -re-requesting it on resume this attack vector can be closed down effectively. +removing any cryptographic key material from memory while doing so. +This is important in a world where most laptop users seldom shut down their computers +but most of the time just suspend them instead. +Previously, the encryption keys for the home directories remained in memory during system suspend, so that +sufficiently equipped attackers could read them from there and gain full access to the device. +By removing the key material from memory before suspend, and re-requesting it on resume this attack vector can be closed down effectively. Supporting this mechanism requires support in the desktop environment, since the encryption keys (i.e. the user's login password) need to be reacquired on -system resume, from a lock screen or similar. This lock screen must run in -system context, and cannot run in the user's own context, since otherwise it +system resume, from a lock screen or similar. +This lock screen must run in system context, and cannot run in the user's own context, since otherwise it might end up accessing the home directory of the user even though access to it is temporarily suspended and thus will hang if attempted. @@ -49,91 +47,86 @@ It is suggested that desktop environments that implement lock screens run them from system context, for example by switching back to the display manager, and only revert back to the session after re-authentication via this system lock screen (re-authentication in this case refers to passing the user's login -credentials to the usual PAM authentication hooks). Or in other words, when -going into system suspend it is recommended that GNOME Shell switches back to +credentials to the usual PAM authentication hooks). 
+Or in other words, when going into system suspend it is recommended that GNOME Shell switches back to the GNOME Display Manager login screen which now should double as screen lock, and only switches back to the shell's UI after the user re-authenticated there. Note that this change in behavior is a good idea in any case, and does not -create any dependencies on `systemd-homed` or systemd-specific APIs. It's -simply a change of behavior regarding use of existing APIs, not a suggested -hook-up to any new APIs. +create any dependencies on `systemd-homed` or systemd-specific APIs. +It's simply a change of behavior regarding use of existing APIs, not a suggested hook-up to any new APIs. A display manager which supports this kind of out-of-context screen lock operation needs to inform systemd-homed about this so that systemd-homed knows -that it is safe to suspend the user's home directory on suspend. This is done -via the `suspend=` argument to the +that it is safe to suspend the user's home directory on suspend. +This is done via the `suspend=` argument to the [`pam_systemd_home`](https://www.freedesktop.org/software/systemd/man/pam_systemd_home.html) -PAM module. A display manager should hence change its PAM stack configuration -to set this parameter to on. `systemd-homed` will not suspend home directories -if there's at least one active session of the user that does not support +PAM module. +A display manager should hence change its PAM stack configuration to set this parameter to on. +`systemd-homed` will not suspend home directories if there's at least one active session of the user that does not support suspending, as communicated via this parameter. ## User Management UIs The rich user/group records `userdb` and `systemd-homed` support carry various -fields of relevance to UIs that manage the local user database or parts -thereof. In particular, most of the metadata `accounts-daemon` (also see below) -supports is directly available in these JSON records. 
Hence it makes sense for -any user management UI to expose them directly. +fields of relevance to UIs that manage the local user database or parts thereof. +In particular, most of the metadata `accounts-daemon` (also see below) +supports is directly available in these JSON records. +Hence it makes sense for any user management UI to expose them directly. `systemd-homed` exposes APIs to add, remove and make changes to local users via -D-Bus, with full [polkit](https://www.freedesktop.org/software/polkit/docs/latest/) -hook-up. On the command line this is exposed via the -`homectl` command. A graphical UI that exposes similar functionality would be +D-Bus, with full [polkit](https://www.freedesktop.org/software/polkit/docs/latest/) hook-up. +On the command line this is exposed via the `homectl` command. A graphical UI that exposes similar functionality would be very useful, exposing the various new account settings, and in particular providing a stream-lined UI for enrolling new-style authentication tokens such -as PKCS#11/YubiKey-style devices. (Ideally, if the user plugs in an -uninitialized YubiKey during operation it might be nice if the Desktop would +as PKCS#11/YubiKey-style devices. +(Ideally, if the user plugs in an uninitialized YubiKey during operation it might be nice if the Desktop would automatically ask if a key pair shall be written to it and the local account be bound to it, `systemd-homed` provides enough YubiKey/PKCS#11 support to make this a reality today; except that it will not take care of token initialization). -A strong point of `systemd-homed` is per-user resource management. In -particular disk space assignments are something that most likely should be +A strong point of `systemd-homed` is per-user resource management. +In particular disk space assignments are something that most likely should be exposed in a user management UI. Various metadata fields are supplied allowing -exposure of disk space assignment "slider" UI. 
Note however that the file system -back-ends of `systemd-homed.service` have different feature sets. Specifically, -only btrfs has online file system shrinking support, ext4 only offline file +exposure of disk space assignment "slider" UI. +Note however that the file system back-ends of `systemd-homed.service` have different feature sets. +Specifically, only btrfs has online file system shrinking support, ext4 only offline file system shrinking support, and xfs no shrinking support at all (all three file -systems support online file system growing however). This means if the LUKS -back-end is used, disk space assignment cannot be instant for logged in users, -unless btrfs is used. - -Note that only `systemd-homed` provides an API for modifying/creating/deleting -users. The generic `userdb` subsystem (which might have other back-ends, besides -`systemd-homed`, for example LDAP or Windows) exclusively provides a read-only -interface. (This is unlikely to change, as the other back-ends might have very -different concepts of adding or modifying users, i.e. might not even have any -local concept for that at all). This means any user management UI that intends -to change (and not just view) user accounts should talk directly to +systems support online file system growing however). +This means if the LUKS back-end is used, disk space assignment cannot be instant for logged in users, unless btrfs is used. + +Note that only `systemd-homed` provides an API for modifying/creating/deleting users. +The generic `userdb` subsystem (which might have other back-ends, besides +`systemd-homed`, for example LDAP or Windows) exclusively provides a read-only interface. +(This is unlikely to change, as the other back-ends might have very +different concepts of adding or modifying users, i.e. might not even have any local concept for that at all). 
+This means any user management UI that intends to change (and not just view) user accounts should talk directly to `systemd-homed` to make use of its features; there's no abstraction available to support other back-ends under the same API. -Unfortunately there's currently no documentation for the `systemd-homed` D-Bus -API. Consider using the `homectl` sources as guidelines for implementing a user -management UI. The JSON user/records are well documented however, see above, +Unfortunately there's currently no documentation for the `systemd-homed` D-Bus API. +Consider using the `homectl` sources as guidelines for implementing a user management UI. +The JSON user/records are well documented however, see above, and the D-Bus API provides limited introspection. ## Relationship to `accounts-daemon` For a long time `accounts-daemon` has been included in Linux distributions -providing richer user accounts. The functionality of this daemon overlaps in -many areas with the functionality of `systemd-homed` or `userdb`, but there are +providing richer user accounts. +The functionality of this daemon overlaps in many areas with the functionality of `systemd-homed` or `userdb`, but there are systematic differences, which means that `systemd-homed` cannot replace -`accounts-daemon` fully. Most importantly: `accounts-daemon` provides -"side-car" metadata for *any* type of user account, while `systemd-homed` only -provides additional metadata for the users it defines itself. In other words: -`accounts-daemon` will augment foreign accounts; `systemd-homed` cannot be used -to augment users defined elsewhere, for example in LDAP or as classic -`/etc/passwd` records. +`accounts-daemon` fully. +Most importantly: `accounts-daemon` provides "side-car" metadata for *any* type of user account, while `systemd-homed` only +provides additional metadata for the users it defines itself. 
+In other words: `accounts-daemon` will augment foreign accounts; `systemd-homed` cannot be used +to augment users defined elsewhere, for example in LDAP or as classic `/etc/passwd` records. This probably means that for the time being, a user management UI (or other UI) that wants to support rich user records with compatibility with the status quo ante should probably talk to both `systemd-homed` and `accounts-daemon` at the -same time, and ignore `accounts-daemon`'s records if `systemd-homed` defines -them. While I (Lennart) personally believe in the long run `systemd-homed` is +same time, and ignore `accounts-daemon`'s records if `systemd-homed` defines them. +While I (Lennart) personally believe in the long run `systemd-homed` is the way to go for rich user records, any UI that wants to manage and support rich records for classic records has to support `accounts-daemon` in parallel for the time being. @@ -145,17 +138,16 @@ probably be removed from the general stack, hence this sounds like a temporary solution only. In case you wonder, there's no automatic mechanism for converting existing -users registered in `/etc/passwd` or LDAP to users managed by -`systemd-homed`. There's documentation for doing this manually though, see -[Converting Existing Users to systemd-homed managed Users](CONVERTING_TO_HOMED). +users registered in `/etc/passwd` or LDAP to users managed by `systemd-homed`. +There's documentation for doing this manually though, see +[Converting Existing Users to systemd-homed managed Users](/CONVERTING_TO_HOMED). ## Future Additions -JSON user/group records are extensible, hence we can easily add any additional -fields desktop environments require. For example, pattern-based authentication -is likely very useful on touch-based devices, and the user records should hence -learn them natively. Fields for other authentication mechanisms, such as -fingerprint authentication should be provided as well, eventually. 
+JSON user/group records are extensible, hence we can easily add any additional fields desktop environments require. +For example, pattern-based authentication is likely very useful on touch-based devices, +and the user records should hence learn them natively. +Fields for other authentication mechanisms, such as fingerprint authentication should be provided as well, eventually. It is planned to extend the `userdb` Varlink API to support look-ups by partial user name and real name (GECOS) data, so that log-in screens can optionally @@ -163,7 +155,7 @@ implement simple complete-as-you-type login screens. It is planned to extend the `systemd-homed` D-Bus API to instantly inform clients about hardware associated with a specific user being plugged in, to which login -screens can listen in order to initiate authentication. Specifically, any -YubiKey-like security token plugged in that is associated with a local user +screens can listen in order to initiate authentication. +Specifically, any YubiKey-like security token plugged in that is associated with a local user record should initiate authentication for that user, making typing in of the username unnecessary. diff --git a/docs/USER_GROUP_API.md b/docs/USER_GROUP_API.md index 567b817..d0016db 100644 --- a/docs/USER_GROUP_API.md +++ b/docs/USER_GROUP_API.md @@ -7,21 +7,22 @@ SPDX-License-Identifier: LGPL-2.1-or-later # User/Group Record Lookup API via Varlink -JSON User/Group Records (as described in the [JSON User Records](USER_RECORD) -and [JSON Group Records](GROUP_RECORD) documents) that are defined on the -local system may be queried with a [Varlink](https://varlink.org/) API. This -API takes both the role of what +JSON User/Group Records (as described in the [JSON User Records](/USER_RECORD) +and [JSON Group Records](/GROUP_RECORD) documents) that are defined on the +local system may be queried with a [Varlink](https://varlink.org/) API. 
+This API takes both the role of what [`getpwnam(3)`](https://man7.org/linux/man-pages/man3/getpwnam.3.html) and related calls are for `struct passwd`, as well as the interfaces modules implementing the [glibc Name Service Switch -(NSS)](https://www.gnu.org/software/libc/manual/html_node/Name-Service-Switch.html) -expose. Or in other words, it both allows applications to efficiently query +(NSS)](https://www.gnu.org/software/libc/manual/html_node/Name-Service-Switch.html) expose. +Or in other words, it both allows applications to efficiently query user/group records from local services, and allows local subsystems to provide user/group records efficiently to local applications. -The concepts described here define an IPC interface. Alternatively, user/group -records may be dropped in number of drop-in directories as files where they are -picked up in addition to the users/groups defined by this IPC logic. See +The concepts described here define an IPC interface. +Alternatively, user/group records may be dropped in a number of drop-in directories as files where they are +picked up in addition to the users/groups defined by this IPC logic. +See [`nss-systemd(8)`](https://www.freedesktop.org/software/systemd/man/nss-systemd.html) for details. @@ -31,58 +32,55 @@ subset of the Varlink functionality. ## Why Varlink? The API described in this document is based on a simple subset of the -mechanisms described by [Varlink](https://varlink.org/). The choice of -preferring Varlink over D-Bus and other IPCs in this context was made for three -reasons: +mechanisms described by [Varlink](https://varlink.org/). +The choice of preferring Varlink over D-Bus and other IPCs in this context was made for three reasons: 1. User/Group record resolution should work during early boot and late shutdown - without special handling. 
+ This is very hard to do with D-Bus, as the broker service for D-Bus generally runs as regular system daemon and is hence only available at the latest boot stage. 2. The JSON user/group records are native JSON data, hence picking an IPC system that natively operates with JSON data is natural and clean. 3. IPC systems such as D-Bus do not provide flow control and are thus unusable - for streaming data. They are useful to pass around short control messages, - but as soon as potentially many and large objects shall be transferred, + for streaming data. + They are useful to pass around short control messages, but as soon as potentially many and large objects shall be transferred, D-Bus is not suitable, as any such streaming of messages would be considered - flooding in D-Bus' logic, and thus possibly result in termination of - communication. Since the APIs defined in this document need to support - enumerating potentially large numbers of users and groups, D-Bus is simply - not an appropriate option. + flooding in D-Bus' logic, and thus possibly result in termination of communication. + Since the APIs defined in this document need to support enumerating potentially large numbers of users and groups, + D-Bus is simply not an appropriate option. ## Concepts Each subsystem that needs to define users and groups on the local system is supposed to implement this API, and offer its interfaces on a Varlink `AF_UNIX`/`SOCK_STREAM` file system socket bound into the -`/run/systemd/userdb/` directory. When a client wants to look up a user or -group record, it contacts all sockets bound in this directory in parallel, and -enqueues the same query to each. The first positive reply is then returned to -the application, or if all fail the last seen error is returned -instead. (Alternatively a special Varlink service is available, +`/run/systemd/userdb/` directory. 
+When a client wants to look up a user or group record, it contacts all sockets bound in this directory in parallel, +and enqueues the same query to each. +The first positive reply is then returned to the application, or if all fail the last seen error is returned instead. +(Alternatively a special Varlink service is available, `io.systemd.Multiplexer` which acts as frontend and will do the parallel -queries on behalf of the client, drastically simplifying client -development. This service is not available during earliest boot and final -shutdown phases.) +queries on behalf of the client, drastically simplifying client development. +This service is not available during earliest boot and final shutdown phases.) Unlike with glibc NSS there's no order or programmatic expression language -defined in which queries are issued to the various services. Instead, all -queries are always enqueued in parallel to all defined services, in order to +defined in which queries are issued to the various services. +Instead, all queries are always enqueued in parallel to all defined services, in order to make look-ups efficient, and the simple rule of "first successful lookup wins" is unconditionally followed for user and group look-ups (though not for membership lookups, see below). This simple scheme only works safely as long as every service providing -user/group records carefully makes sure not to answer with conflicting -records. This API does not define any mechanisms for dealing with user/group -name/ID collisions during look-up nor during record registration. It assumes -the various subsystems that want to offer user and group records to the rest of +user/group records carefully makes sure not to answer with conflicting records. +This API does not define any mechanisms for dealing with user/group +name/ID collisions during look-up nor during record registration. 
+It assumes the various subsystems that want to offer user and group records to the rest of the system have made sufficiently sure in advance that their definitions do not -collide with those of other services. Clients are not expected to merge -multiple definitions for the same user or group, and will also not be able to -detect conflicts and suppress such conflicting records. +collide with those of other services. +Clients are not expected to merge multiple definitions for the same user or group, +and will also not be able to detect conflicts and suppress such conflicting records. It is recommended to name the sockets in the directory in reverse domain name notation, but this is neither required nor enforced. @@ -90,27 +88,25 @@ notation, but this is neither required nor enforced. ## Well-Known Services Any subsystem that wants to provide user/group records can do so, simply by -binding a socket in the aforementioned directory. By default two -services are listening there, that have special relevance: +binding a socket in the aforementioned directory. +By default two services are listening there, that have special relevance: 1. `io.systemd.NameServiceSwitch` → This service makes the classic UNIX/glibc - NSS user/group records available as JSON User/Group records. Any such - records are automatically converted as needed, and possibly augmented with + NSS user/group records available as JSON User/Group records. + Any such records are automatically converted as needed, and possibly augmented with information from the shadow databases. -2. `io.systemd.Multiplexer` → This service multiplexes client queries to all - other running services. It's supposed to simplify client development: in - order to look up or enumerate user/group records it's sufficient to talk to - one service instead of all of them in parallel. 
Note that it is not available - during earliest boot and final shutdown phases, hence for programs running - in that context it is preferable to implement the parallel lookup - themselves. +2. `io.systemd.Multiplexer` → This service multiplexes client queries to all other running services. + It's supposed to simplify client development: in order to look up or enumerate user/group records it's sufficient to talk to + one service instead of all of them in parallel. + Note that it is not available during earliest boot and final shutdown phases, hence for programs running + in that context it is preferable to implement the parallel lookup themselves. Both these services are implemented by the same daemon `systemd-userdbd.service`. -Note that these services currently implement a subset of Varlink only. For -example, introspection is not available, and the resolver logic is not used. +Note that these services currently implement a subset of Varlink only. +For example, introspection is not available, and the resolver logic is not used. ## Other Services @@ -129,35 +125,33 @@ interface. Specifically: `systemd-machined.service` and provides records for the users and groups used by local containers that use user namespacing. -Other projects are invited to implement these services too. For example, it -would make sense for LDAP/ActiveDirectory projects to implement these +Other projects are invited to implement these services too. +For example, it would make sense for LDAP/ActiveDirectory projects to implement these interfaces, which would provide them a way to do per-user resource management enforced by systemd and defined directly in LDAP directories. ## Compatibility with NSS -Two-way compatibility with classic UNIX/glibc NSS user/group records is -provided. When using the Varlink API, lookups into databases provided only via -NSS (and not natively via Varlink) are handled by the -`io.systemd.NameServiceSwitch` service (see above). When using the NSS API -(i.e. 
`getpwnam()` and friends) the `nss-systemd` module will automatically -synthesize NSS records for users/groups natively defined via a Varlink -API. Special care is taken to avoid recursion between these two compatibility -mechanisms. +Two-way compatibility with classic UNIX/glibc NSS user/group records is provided. +When using the Varlink API, lookups into databases provided only via +NSS (and not natively via Varlink) are handled by the `io.systemd.NameServiceSwitch` service (see above). +When using the NSS API (i.e. `getpwnam()` and friends) the `nss-systemd` module will automatically +synthesize NSS records for users/groups natively defined via a Varlink API. +Special care is taken to avoid recursion between these two compatibility mechanisms. Subsystems that shall provide user/group records to the system may choose between offering them via an NSS module or via a this Varlink API, either way -all records are accessible via both APIs, due to the bidirectional -forwarding. It is also possible to provide the same records via both APIs -directly, but in that case the compatibility logic must be turned off. There -are mechanisms in place for this, please contact the systemd project for +all records are accessible via both APIs, due to the bidirectional forwarding. +It is also possible to provide the same records via both APIs +directly, but in that case the compatibility logic must be turned off. +There are mechanisms in place for this, please contact the systemd project for details, as these are currently not documented. ## Caching of User Records -This API defines no concepts for caching records. If caching is desired it -should be implemented in the subsystems that provide the user records, not in -the clients consuming them. +This API defines no concepts for caching records. +If caching is desired it should be implemented in the subsystems that provide the user records, +not in the clients consuming them. 
## Method Calls @@ -198,16 +192,15 @@ error ConflictingRecordFound() error EnumerationNotSupported() ``` -The `GetUserRecord` method looks up or enumerates a user record. If the `uid` -parameter is set it specifies the numeric UNIX UID to search for. If the -`userName` parameter is set it specifies the name of the user to search -for. Typically, only one of the two parameters are set, depending whether a -look-up by UID or by name is desired. However, clients may also specify both -parameters, in which case a record matching both will be returned, and if only -one exists that matches one of the two parameters but not the other an error of -`ConflictingRecordFound` is returned. If neither of the two parameters are set -the whole user database is enumerated. In this case the method call needs to be -made with `more` set, so that multiple method call replies may be generated as +The `GetUserRecord` method looks up or enumerates a user record. +If the `uid` parameter is set it specifies the numeric UNIX UID to search for. +If the `userName` parameter is set it specifies the name of the user to search for. +Typically, only one of the two parameters are set, depending whether a +look-up by UID or by name is desired. +However, clients may also specify both parameters, in which case a record matching both will be returned, and if only +one exists that matches one of the two parameters but not the other an error of `ConflictingRecordFound` is returned. +If neither of the two parameters are set the whole user database is enumerated. +In this case the method call needs to be made with `more` set, so that multiple method call replies may be generated as effect, each carrying one user record. The `service` parameter is mandatory and should be set to the service name @@ -217,13 +210,12 @@ of multiple services on the same socket (which is used by `systemd-userdbd.service`). The method call returns one or more user records, depending which type of query is -used (see above). 
The record is returned in the `record` field. The -`incomplete` field indicates whether the record is complete. Services providing -user record lookup should only pass the `privileged` section of user records to +used (see above). The record is returned in the `record` field. +The `incomplete` field indicates whether the record is complete. +Services providing user record lookup should only pass the `privileged` section of user records to clients that either match the user the record is about or to sufficiently -privileged clients, for all others the section must be removed so that no -sensitive data is leaked this way. The `incomplete` parameter should indicate -whether the record has been modified like this or not (i.e. it is `true` if a +privileged clients, for all others the section must be removed so that no sensitive data is leaked this way. +The `incomplete` parameter should indicate whether the record has been modified like this or not (i.e. it is `true` if a `privileged` section existed in the user record and was removed, and `false` if no `privileged` section existed or one existed but hasn't been removed). @@ -233,53 +225,48 @@ specified, and hence enumeration requested but the subsystem currently has no users defined). If a method call with an incorrectly set `service` field is received -(i.e. either not set at all, or not to the service's own name) a `BadService` -error is generated. Finally, `ServiceNotAvailable` should be returned when the -backing subsystem is not operational for some reason and hence no information -about existence or non-existence of a record can be returned nor any user -record at all. (The `service` field is defined in order to allow implementation -of daemons that provide multiple distinct user/group services over the same +(i.e. either not set at all, or not to the service's own name) a `BadService` error is generated. 
+Finally, `ServiceNotAvailable` should be returned when the backing subsystem is not operational for some reason and hence no information +about existence or non-existence of a record can be returned nor any user record at all. +(The `service` field is defined in order to allow implementation of daemons that provide multiple distinct user/group services over the same `AF_UNIX` socket: in order to correctly determine which service a client wants to talk to, the client needs to provide the name in each request.) The `GetGroupRecord` method call works analogously but for groups. -The `GetMemberships` method call may be used to inquire about group -memberships. The `userName` and `groupName` arguments take what the name -suggests. If one of the two is specified all matching memberships are returned, -if neither is specified all known memberships of any user and any group are -returned. The return value is a pair of user name and group name, where the -user is a member of the group. If both arguments are specified the specified -membership will be tested for, but no others, and the pair is returned if it is +The `GetMemberships` method call may be used to inquire about group memberships. +The `userName` and `groupName` arguments take what the name suggests. +If one of the two is specified all matching memberships are returned, +if neither is specified all known memberships of any user and any group are returned. +The return value is a pair of user name and group name, where the user is a member of the group. +If both arguments are specified the specified membership will be tested for, but no others, and the pair is returned if it is defined. Unless both arguments are specified the method call needs to be made with `more` set, so that multiple replies can be returned (since typically -there are multiple members per group and also multiple groups a user is -member of). 
As with `GetUserRecord` and `GetGroupRecord` the `service` +there are multiple members per group and also multiple groups a user is a member of). +As with `GetUserRecord` and `GetGroupRecord` the `service` parameter needs to contain the name of the service being talked to, in order to -allow implementation of multiple services within the same IPC socket. In case no -matching membership is known `NoRecordFound` is returned. The other two errors -are also generated in the same cases as for `GetUserRecord` and +allow implementation of multiple services within the same IPC socket. +In case no matching membership is known `NoRecordFound` is returned. +The other two errors are also generated in the same cases as for `GetUserRecord` and `GetGroupRecord`. Unlike with `GetUserRecord` and `GetGroupRecord` the lists of memberships -returned by services are always combined. Thus unlike the other two calls a -membership lookup query has to wait for the last simultaneous query to complete +returned by services are always combined. +Thus unlike the other two calls a membership lookup query has to wait for the last simultaneous query to complete before the complete list is acquired. -Note that only the `GetMemberships` call is authoritative about memberships of -users in groups. i.e. it should not be considered sufficient to check the +Note that only the `GetMemberships` call is authoritative about memberships of users in groups. +I.e. it should not be considered sufficient to check the `memberOf` field of user records and the `members` field of group records to -acquire the full list of memberships. The full list can only be determined by -`GetMemberships`, and as mentioned requires merging of these lists of all local -services. Result of this is that it can be one service that defines a user A, -and another service that defines a group B, and a third service that declares -that A is a member of B. +acquire the full list of memberships. 
+The full list can only be determined by `GetMemberships`, and as mentioned requires merging of these lists of all local services. +Result of this is that it can be one service that defines a user A, +and another service that defines a group B, and a third service that declares that A is a member of B. Looking up explicit users/groups by their name or UID/GID, or querying -user/group memberships must be supported by all services implementing these -interfaces. However, supporting enumeration (i.e. user/group lookups that may -result in more than one reply, because neither UID/GID nor name is specified) -is optional. Services which are asked for enumeration may return the -`EnumerationNotSupported` error in this case. +user/group memberships must be supported by all services implementing these interfaces. +However, supporting enumeration (i.e. user/group lookups that may +result in more than one reply, because neither UID/GID nor name is specified) is optional. +Services which are asked for enumeration may return the `EnumerationNotSupported` error in this case. And that's really all there is to it. diff --git a/docs/USER_NAMES.md b/docs/USER_NAMES.md index 74c24b5..fe0ca7f 100644 --- a/docs/USER_NAMES.md +++ b/docs/USER_NAMES.md @@ -7,49 +7,45 @@ SPDX-License-Identifier: LGPL-2.1-or-later # User/Group Name Syntax -The precise set of allowed user and group names on Linux systems is weakly -defined. Depending on the distribution a different set of requirements and +The precise set of allowed user and group names on Linux systems is weakly defined. +Depending on the distribution a different set of requirements and restrictions on the syntax of user/group names are enforced — on some -distributions the accepted syntax is even configurable by the administrator. In -the interest of interoperability systemd enforces different rules when +distributions the accepted syntax is even configurable by the administrator. 
+In the interest of interoperability systemd enforces different rules when processing users/group defined by other subsystems and when defining users/groups -itself, following the principle of "Be conservative in what you send, be -liberal in what you accept". Also in the interest of interoperability systemd -will enforce the same rules everywhere and not make them configurable or -distribution dependent. The precise rules are described below. +itself, following the principle of "Be conservative in what you send, be liberal in what you accept". +Also in the interest of interoperability systemd will enforce the same rules everywhere and not make them configurable or distribution dependent. +The precise rules are described below. Generally, the same rules apply for user as for group names. ## Other Systems -* On POSIX the set of [valid user - names](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_437) - is defined as [lower and upper case ASCII letters, digits, period, - underscore, and - hyphen](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_282), - with the restriction that hyphen is not allowed as first character of the - user name. Interestingly no size limit is declared, i.e. in neither +* On POSIX the set of + [valid user names](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_437) + is defined as + [lower and upper case ASCII letters, digits, period, underscore, and hyphen](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_282), + with the restriction that hyphen is not allowed as first character of the user name. + Interestingly no size limit is declared, i.e. in neither direction, meaning that strictly speaking, according to POSIX, both the empty string is a valid user name as well as a string of gigabytes in length. -* Debian/Ubuntu based systems enforce the regular expression - `^[a-z][-a-z0-9]*$`, i.e. 
only lower case ASCII letters, digits and - hyphens. As first character only lowercase ASCII letters are allowed. This - regular expression is configurable by the administrator at runtime - though. This rule enforces a minimum length of one character but no maximum - length. +* Debian/Ubuntu based systems enforce the regular expression `^[a-z][-a-z0-9]*$`, i.e. + only lower case ASCII letters, digits and hyphens. + As first character only lowercase ASCII letters are allowed. + This regular expression is configurable by the administrator at runtime though. + This rule enforces a minimum length of one character but no maximum length. * Upstream shadow-utils enforces the regular expression - `^[a-z_][a-z0-9_-]*[$]$`, i.e. is similar to the Debian/Ubuntu rule, but - allows underscores and hyphens, but the latter not as first character. Also, - an optional trailing dollar character is permitted. + `^[a-z_][a-z0-9_-]*[$]$`, i.e. is similar to the Debian/Ubuntu rule, + but allows underscores and hyphens, but the latter not as first character. + Also, an optional trailing dollar character is permitted. * Fedora/Red Hat based systems enforce the regular expression of `^[a-zA-Z0-9_.][a-zA-Z0-9_.-]{0,30}[a-zA-Z0-9_.$-]?$`, i.e. a size limit of - 32 characters, with upper and lower case letters, digits, underscores, - hyphens and periods. No hyphen as first character though, and the last - character may be a dollar character. On top of that, `.` and `..` are not - allowed as user/group names. + 32 characters, with upper and lower case letters, digits, underscores, hyphens and periods. + No hyphen as first character though, and the last character may be a dollar character. + On top of that, `.` and `..` are not allowed as user/group names. * sssd is known to generate user names with embedded `@` and white-space characters, as well as non-ASCII (i.e. UTF-8) user/group names. @@ -58,16 +54,15 @@ Generally, the same rules apply for user as for group names. 
white-space characters, as well as non-ASCII (i.e. UTF-8) user/group names. Other operating systems enforce different rules; in this documentation we'll -focus on Linux systems only however, hence those are out of scope. That said, -software like Samba is frequently deployed on Linux for providing compatibility +focus on Linux systems only however, hence those are out of scope. +That said, software like Samba is frequently deployed on Linux for providing compatibility with Windows systems; on such systems it might be wise to stick to user/group names also valid according to Windows rules. ## Rules systemd enforces -Distilled from the above, below are the rules systemd enforces on user/group -names. An additional, common rule between both modes listed below is that empty -strings are not valid user/group names. +Distilled from the above, below are the rules systemd enforces on user/group names. +An additional, common rule between both modes listed below is that empty strings are not valid user/group names. Philosophically, the strict mode described below enforces an allow list of what's allowed and prohibits everything else, while the relaxed mode described @@ -83,18 +78,17 @@ or a regular user with [`systemd-homed.service`](https://www.freedesktop.org/software/systemd/man/systemd-homed.html). In strict mode, only uppercase and lowercase characters are allowed, as well as -digits, underscores and hyphens. The first character may not be a digit or -hyphen. A size limit is enforced: the minimum of `sysconf(_SC_LOGIN_NAME_MAX)` +digits, underscores and hyphens. +The first character may not be a digit or hyphen. 
A size limit is enforced: the minimum of `sysconf(_SC_LOGIN_NAME_MAX)` (typically 256 on Linux; rationale: this is how POSIX suggests to detect the limit), `UT_NAMESIZE-1` (typically 31 on Linux; rationale: names longer than this cannot correctly appear in `utmp`/`wtmp` and create ambiguity with login accounting) and `NAME_MAX` (255 on Linux; rationale: user names typically -appear in directory names, i.e. the home directory), thus MIN(256, 31, 255) = -31. +appear in directory names, i.e. the home directory), thus MIN(256, 31, 255) = 31. Note that these rules are both more strict and more relaxed than all of the -rules enforced by other systems listed above. A user/group name conforming to -systemd's strict rules will not necessarily pass a test by the rules enforced +rules enforced by other systems listed above. +A user/group name conforming to systemd's strict rules will not necessarily pass a test by the rules enforced by these other subsystems. Written as regular expression the above is: `^[a-zA-Z_][a-zA-Z0-9_-]{0,30}$` @@ -107,8 +101,8 @@ components of the system, for example in [`systemd-logind.service`](https://www.freedesktop.org/software/systemd/man/systemd-logind.html). Relaxed syntax is also enforced by the `User=` setting in service unit files, -i.e. for system services used for running services. Since these users may be -registered by a variety of tools relaxed mode is used, but since the primary +i.e. for system services used for running services. +Since these users may be registered by a variety of tools relaxed mode is used, but since the primary purpose of these users is to run a system service and thus a job for systemd a warning is shown if the specified user name does not qualify by the strict rules above. @@ -150,16 +144,15 @@ Note that these relaxed rules are implied by the strict rules above, i.e. all user/group names accepted by the strict rules are also accepted by the relaxed rules, but not vice versa. 
-Note that this relaxed mode does not refuse a couple of very questionable -syntaxes. For example, it permits a leading or embedded period. A leading period -is problematic because the matching home directory would typically be hidden -from the user's/administrator's view. An embedded period is problematic since -it creates ambiguity in traditional `chown` syntax (which is still accepted +Note that this relaxed mode does not refuse a couple of very questionable syntaxes. +For example, it permits a leading or embedded period. +A leading period is problematic because the matching home directory would typically be hidden +from the user's/administrator's view. +An embedded period is problematic since it creates ambiguity in traditional `chown` syntax (which is still accepted today) that uses it to separate user and group names in the command's parameter: without consulting the user/group databases it is not possible to -determine if a `chown` invocation would change just the owning user or both the -owning user and group. It also allows embedding `@` (which is confusing to -MTAs). +determine if a `chown` invocation would change just the owning user or both the owning user and group. +It also allows embedding `@` (which is confusing to MTAs). ## Common Core diff --git a/docs/USER_RECORD.md b/docs/USER_RECORD.md index 8cfb053..5d43de5 100644 --- a/docs/USER_RECORD.md +++ b/docs/USER_RECORD.md @@ -15,7 +15,7 @@ pairs, encoded as JSON. Specifically: 1. [`systemd-homed.service`](https://www.freedesktop.org/software/systemd/man/systemd-homed.service.html) manages `human` user home directories and embeds these JSON records directly in the home directory images - (see [Home Directories](HOME_DIRECTORY) for details). + (see [Home Directories](/HOME_DIRECTORY) for details). 2. 
[`pam_systemd`](https://www.freedesktop.org/software/systemd/man/pam_systemd.html) processes these JSON records for users that log in, and applies various @@ -72,11 +72,11 @@ the following extensions are envisioned: 4. Default parameters for backup applications and similar Similar to JSON User Records there are also -[JSON Group Records](GROUP_RECORD) that encapsulate UNIX groups. +[JSON Group Records](/GROUP_RECORD) that encapsulate UNIX groups. JSON User Records may be transferred or written to disk in various protocols and formats. To inquire about such records defined on the local system use the -[User/Group Lookup API via Varlink](USER_GROUP_API). User/group records may +[User/Group Lookup API via Varlink](/USER_GROUP_API). User/group records may also be dropped in number of drop-in directories as files. See [`nss-systemd(8)`](https://www.freedesktop.org/software/systemd/man/nss-systemd.html) for details. @@ -214,7 +214,7 @@ object. The following fields are currently defined: UNIX user name. This field is the only mandatory field, all others are optional. Corresponds with the `pw_name` field of `struct passwd` and the `sp_namp` field of `struct spwd` (i.e. the shadow user record stored in -`/etc/shadow`). See [User/Group Name Syntax](USER_NAMES) for +`/etc/shadow`). See [User/Group Name Syntax](/USER_NAMES) for the (relaxed) rules the various systemd components enforce on user/group names. `realm` → The "realm" a user is defined in. This concept allows distinguishing diff --git a/docs/VIRTUALIZED_TESTING.md b/docs/VIRTUALIZED_TESTING.md index 94a5606..f419c49 100644 --- a/docs/VIRTUALIZED_TESTING.md +++ b/docs/VIRTUALIZED_TESTING.md @@ -15,7 +15,7 @@ Here's a nice hack if you regularly build and test-boot systemd, are gutsy enoug Create a shell script like this: -``` +```sh #!/bin/sh sudo sync @@ -26,16 +26,22 @@ sudo modprobe kvm-intel exec sudo qemu-kvm -smp 2 -m 512 -snapshot /dev/sda ``` -This will boot your local host system as a throw-away VM guest. 
It will take your main harddisk, boot from it in the VM, allow changes to it, but these changes are all just buffered in memory and never hit the real disk. Any changes made in this VM will be lost when the VM terminates. I have called this script "q", and hence for test booting my own system all I do is type the following command in my systemd source tree and I can see if it worked. +This will boot your local host system as a throw-away VM guest. +It will take your main hard disk, boot from it in the VM, allow changes to it, but these changes are all just buffered in memory and never hit the real disk. +Any changes made in this VM will be lost when the VM terminates. +I have called this script "q", and hence for test booting my own system all I do is type the following command in my systemd source tree and I can see if it worked. ``` $ make -j10 && sudo make install && q - ``` -The first three lines are necessary to ensure that the kernel's disk caches are all synced to disk before qemu takes the snapshot of it. Yes, invoking "umount /" will sync your file system to disk as a side effect, even though it will actually fail. When the machine boots up the file system will still be marked dirty (and hence you will get an fsck, usually), but it will work fine nonetheless in virtually all cases. +The first three lines are necessary to ensure that the kernel's disk caches are all synced to disk before qemu takes the snapshot of it. +Yes, invoking "umount /" will sync your file system to disk as a side effect, even though it will actually fail. +When the machine boots up the file system will still be marked dirty (and hence you will get an fsck, usually), but it will work fine nonetheless in virtually all cases. -Of course, if the host's hard disk changes while the VM is running this will be visible to the VM, and might confuse it. If you use this little hack you should keep changes on the host at a minimum, hence. Yeah this all is a hack, but a really useful and neat one. 
+Of course, if the host's hard disk changes while the VM is running this will be visible to the VM, and might confuse it. +If you use this little hack you should keep changes on the host at a minimum, hence. +Yeah this all is a hack, but a really useful and neat one. YMMV if you use LVM or btrfs. @@ -43,13 +49,14 @@ YMMV if you use LVM or btrfs. Test-booting systemd in a container has the benefit of being much easier to debug/instrument from the outside. -**Important**: As preparation it is essential to turn off auditing entirely on your system. Auditing is broken with containers, and will trigger all kinds of error in containers if not turned off. Use `audit=0` on the host's kernel command line to turn it off. +**Important**: As preparation it is essential to turn off auditing entirely on your system. +Auditing is broken with containers, and will trigger all kinds of errors in containers if not turned off. +Use `audit=0` on the host's kernel command line to turn it off. Then, as the first step I install Fedora into a container tree: ``` $ sudo yum -y --releasever=20 --installroot=$HOME/fedora-tree --disablerepo='*' --enablerepo=fedora install systemd passwd yum fedora-release vim-minimal - ``` You can do something similar with debootstrap on a Debian system. Now, we need to set a root password in order to be able to log in: @@ -65,14 +72,12 @@ As the next step we can already boot the container: ``` $ sudo systemd-nspawn -bD ~/fedora-tree/ 3 - ``` To test systemd in the container I then run this from my source tree on the host: ``` $ make -j10 && sudo DESTDIR=$HOME/fedora-tree make install && sudo systemd-nspawn -bD ~/fedora-tree/ 3 - ``` And that's already it. 
diff --git a/docs/WRITING_DESKTOP_ENVIRONMENTS.md b/docs/WRITING_DESKTOP_ENVIRONMENTS.md index b50c857..774308d 100644 --- a/docs/WRITING_DESKTOP_ENVIRONMENTS.md +++ b/docs/WRITING_DESKTOP_ENVIRONMENTS.md @@ -9,22 +9,41 @@ SPDX-License-Identifier: LGPL-2.1-or-later _Or: how to hook up your favorite desktop environment with logind_ -systemd's logind service obsoletes ConsoleKit which was previously widely used on Linux distributions. This provides a number of new features, but also requires updating of the Desktop Environment running on it, in a few ways. +systemd's logind service obsoletes ConsoleKit which was previously widely used on Linux distributions. +This provides a number of new features, but also requires updating of the Desktop Environment running on it, in a few ways. -This document should be read together with [Writing Display Managers](http://www.freedesktop.org/wiki/Software/systemd/writing-display-managers) which focuses on the porting work necessary for display managers. +This document should be read together with [Writing Display Managers](/WRITING_DISPLAY_MANAGERS) which focuses on the porting work necessary for display managers. -If required it is possible to implement ConsoleKit and systemd-logind support in the same desktop environment code, detecting at runtime which interface is needed. The [sd_booted()](http://www.freedesktop.org/software/systemd/man/sd_booted.html) call may be used to determine at runtime whether systemd is used. +If required it is possible to implement ConsoleKit and systemd-logind support in the same desktop environment code, detecting at runtime which interface is needed. +The [sd_booted()](http://www.freedesktop.org/software/systemd/man/sd_booted.html) call may be used to determine at runtime whether systemd is used. To a certain level ConsoleKit and systemd-logind may be used side-by-side, but a number of features are not available if ConsoleKit is used. 
-Please have a look at the [Bus API of logind](http://www.freedesktop.org/wiki/Software/systemd/logind) and the C API as documented in [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html). (Also see below) +Please have a look at the [Bus API of logind](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.login1.html) and the C API as documented in [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html). (Also see below) Here are the suggested changes: -- Your session manager should listen to "Lock" and "Unlock" messages that are emitted from the session object logind exposes for your DE session, on the system bus. If "Lock" is received the screen lock should be activated, if "Unlock" is received it should be deactivated. This can easily be tested with "loginctl lock-sessions". See the [Bus API of logind](http://www.freedesktop.org/wiki/Software/systemd/logind) for further details. -- Whenever the session gets idle the DE should invoke the SetIdleHint(True) call on the respective session object on the session bus. This is necessary for the system to implement auto-suspend when all sessions are idle. If the session gets used again it should call SetIdleHint(False). A session should be considered idle if it didn't receive user input (mouse movements, keyboard) in a while. See the [Bus API of logind](http://www.freedesktop.org/wiki/Software/systemd/logind) for further details. -- To reboot/power-off/suspend/hibernate the machine from the DE use logind's bus calls Reboot(), PowerOff(), Suspend(), Hibernate(), HybridSleep(). For further details see [Bus API of logind](http://www.freedesktop.org/wiki/Software/systemd/logind). -- If your session manager handles the special power, suspend, hibernate hardware keys or the laptop lid switch on its own it is welcome to do so, but needs to disable logind's built-in handling of these events. 
Take one or more of the _handle-power-key_, _handle-suspend-key_, _handle-hibernate-key_, _handle-lid-switch_ inhibitor locks for that. See [Inhibitor Locks](http://www.freedesktop.org/wiki/Software/systemd/inhibit) for further details on this. -- Before rebooting/powering-off/suspending/hibernating and when the operation is triggered by the user by clicking on some UI elements (or suchlike) it is recommended to show the list of currently active inhibitors for the operation, and ask the user to acknowledge the operation. Note that PK often allows the user to execute the operation ignoring the inhibitors. Use logind's ListInhibitors() call to get a list of these inhibitors. See [Inhibitor Locks](http://www.freedesktop.org/wiki/Software/systemd/inhibit) for further details on this. -- If your DE contains a process viewer of some kind ("system monitor") it's a good idea to show session, service and seat information for each process. Use sd_pid_get_session(), sd_pid_get_unit(), sd_session_get_seat() to determine these. For details see [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html). - And that's all! Thank you! +- Your session manager should listen to "Lock" and "Unlock" messages that are emitted from the session object logind exposes for your DE session, on the system bus. + If "Lock" is received the screen lock should be activated, if "Unlock" is received it should be deactivated. + This can easily be tested with "loginctl lock-sessions". + See the [Bus API of logind](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.login1.html) for further details. +- Whenever the session gets idle the DE should invoke the SetIdleHint(True) call on the respective session object on the session bus. + This is necessary for the system to implement auto-suspend when all sessions are idle. + If the session gets used again it should call SetIdleHint(False). 
+ A session should be considered idle if it didn't receive user input (mouse movements, keyboard) in a while. + See the [Bus API of logind](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.login1.html) for further details. +- To reboot/power-off/suspend/hibernate the machine from the DE use logind's bus calls Reboot(), PowerOff(), Suspend(), Hibernate(), HybridSleep(). + For further details see [Bus API of logind](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.login1.html). +- If your session manager handles the special power, suspend, hibernate hardware keys or the laptop lid switch on its own it is welcome to do so, + but needs to disable logind's built-in handling of these events. + Take one or more of the _handle-power-key_, _handle-suspend-key_, _handle-hibernate-key_, _handle-lid-switch_ inhibitor locks for that. + See [Inhibitor Locks](/INHIBITOR_LOCKS) for further details on this. +- Before rebooting/powering-off/suspending/hibernating and when the operation is triggered by the user by clicking on some UI elements + (or suchlike) it is recommended to show the list of currently active inhibitors for the operation, and ask the user to acknowledge the operation. + Note that PK often allows the user to execute the operation ignoring the inhibitors. + Use logind's ListInhibitors() call to get a list of these inhibitors. See [Inhibitor Locks](/INHIBITOR_LOCKS) for further details on this. +- If your DE contains a process viewer of some kind ("system monitor") it's a good idea to show session, service and seat information for each process. + Use sd_pid_get_session(), sd_pid_get_unit(), sd_session_get_seat() to determine these. + For details see [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html). + +And that's all! Thank you! 
diff --git a/docs/WRITING_DISPLAY_MANAGERS.md b/docs/WRITING_DISPLAY_MANAGERS.md index efdbccc..467e8a8 100644 --- a/docs/WRITING_DISPLAY_MANAGERS.md +++ b/docs/WRITING_DISPLAY_MANAGERS.md @@ -9,31 +9,51 @@ SPDX-License-Identifier: LGPL-2.1-or-later _Or: How to hook up your favorite X11 display manager with systemd_ -systemd's logind service obsoletes ConsoleKit which was previously widely used on Linux distributions. For X11 display managers the switch to logind requires a minimal amount of porting, however brings a couple of new features: true automatic multi-seat support, proper tracking of session processes, (optional) automatic killing of user processes on logout, a synchronous low-level C API and much simplification. +systemd's logind service obsoletes ConsoleKit which was previously widely used on Linux distributions. +For X11 display managers the switch to logind requires a minimal amount of porting, however brings a couple of new features: +true automatic multi-seat support, proper tracking of session processes, (optional) automatic killing of user processes on logout, a synchronous low-level C API and much simplification. -This document should be read together with [Writing Desktop Environments](http://www.freedesktop.org/wiki/Software/systemd/writing-desktop-environments) which focuses on the porting work necessary for desktop environments. +This document should be read together with [Writing Desktop Environments](/WRITING_DESKTOP_ENVIRONMENTS) which focuses on the porting work necessary for desktop environments. -If required it is possible to implement ConsoleKit and systemd-logind support in the same display manager, detecting at runtime which interface is needed. The [sd_booted()](http://www.freedesktop.org/software/systemd/man/sd_booted.html) call may be used to determine at runtime whether systemd is used. 
+If required it is possible to implement ConsoleKit and systemd-logind support in the same display manager, detecting at runtime which interface is needed. +The [sd_booted()](http://www.freedesktop.org/software/systemd/man/sd_booted.html) call may be used to determine at runtime whether systemd is used. To a certain level ConsoleKit and systemd-logind may be used side-by-side, but a number of features are not available if ConsoleKit is used, for example automatic multi-seat support. -Please have a look at the [Bus API of logind](http://www.freedesktop.org/wiki/Software/systemd/logind) and the C API as documented in [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html). (Also see below) +Please have a look at the [Bus API of logind](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.login1.html) and the C API as documented in [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html). +(Also see below) Minimal porting (without multi-seat) requires the following: 1. Remove/disable all code responsible for registering your service with ConsoleKit. -2. Make sure to register your greeter session via the PAM session stack, and make sure the PAM session modules include pam_systemd. Also, make sure to set the session class to "greeter." This may be done by setting the environment variable XDG_SESSION_CLASS to "greeter" with pam_misc_setenv() or setting the "class=greeter" option in the pam_systemd module, in order to allow applications to filter out greeter sessions from normal login sessions. +2. Make sure to register your greeter session via the PAM session stack, and make sure the PAM session modules include pam_systemd. + Also, make sure to set the session class to "greeter." 
This may be done by setting the environment variable XDG_SESSION_CLASS to "greeter" with pam_misc_setenv() or setting the "class=greeter" option in the pam_systemd module, in order to allow applications to filter out greeter sessions from normal login sessions. 3. Make sure to register your logged in session via the PAM session stack as well, also including pam_systemd in it. -4. Optionally, use pam_misc_setenv() to set the environment variables XDG_SEAT and XDG_VTNR. The former should contain "seat0", the latter the VT number your session runs on. pam_systemd can determine these values automatically but it's nice to pass these variables anyway. - In summary: porting a display manager from ConsoleKit to systemd primarily means removing code, not necessarily adding any new code. Here, a cheers to simplicity! - -Complete porting (with multi-seat) requires the following (Before you continue, make sure to read up on [Multi-Seat on Linux](http://www.freedesktop.org/wiki/Software/systemd/multiseat) first.): - -1. Subscribe to seats showing up and going away, via the systemd-logind D-Bus interface's SeatAdded and SeatRemoved signals. Take possession of each seat by spawning your greeter on it. However, do so exclusively for seats where the boolean CanGraphical property is true. Note that there are seats that cannot do graphical, and there are seats that are text-only first, and gain graphical support later on. Most prominently this is actually seat0 which comes up in text mode, and where the graphics driver is then loaded and probed during boot. This means display managers must watch PropertyChanged events on all seats, to see if they gain (or lose) the CanGraphical field. +4. Optionally, use pam_misc_setenv() to set the environment variables XDG_SEAT and XDG_VTNR. + The former should contain "seat0", the latter the VT number your session runs on. pam_systemd can determine these values automatically but it's nice to pass these variables anyway. 
+In summary: porting a display manager from ConsoleKit to systemd primarily means removing code, not necessarily adding any new code. Here, a cheers to simplicity! + +Complete porting (with multi-seat) requires the following (Before you continue, make sure to read up on [Multi-Seat on Linux](https://www.freedesktop.org/wiki/Software/systemd/multiseat) first.): + +1. Subscribe to seats showing up and going away, via the systemd-logind D-Bus interface's SeatAdded and SeatRemoved signals. + Take possession of each seat by spawning your greeter on it. + However, do so exclusively for seats where the boolean CanGraphical property is true. + Note that there are seats that cannot do graphical, and there are seats that are text-only first, and gain graphical support later on. + Most prominently this is actually seat0 which comes up in text mode, and where the graphics driver is then loaded and probed during boot. + This means display managers must watch PropertyChanged events on all seats, to see if they gain (or lose) the CanGraphical field. 2. Use ListSeats() on the D-Bus interface to acquire a list of already available seats and also take possession of them. -3. For each seat you spawn a greeter/user session on use the XDG_SEAT and XDG_VTNR PAM environment variables to inform pam_systemd about the seat name, resp. VT number you start them on. Note that only the special seat "seat0" actually knows kernel VTs, so you shouldn't pass the VT number on any but the main seat, since it doesn't make any sense there. +3. For each seat you spawn a greeter/user session on use the XDG_SEAT and XDG_VTNR PAM environment variables to inform pam_systemd about the seat name, resp. + VT number you start them on. Note that only the special seat "seat0" actually knows kernel VTs, so you shouldn't pass the VT number on any but the main seat, since it doesn't make any sense there. 4. Pass the seat name to the X server you start via the -seat parameter. -5. 
At this time X interprets the -seat parameter natively only for input devices, not for graphics devices. To work around this limitation we provide a tiny wrapper /lib/systemd/systemd-multi-seat-x which emulates the enumeration for graphics devices too. This wrapper will eventually go away, as soon as X learns udev-based graphics device enumeration natively, instead of the current PCI based one. Hence it is a good idea to fall back to the real X when this wrapper is not found. You may use this wrapper exactly like the real X server, and internally it will just exec() it after putting together a minimal multi-seat configuration. +5. At this time X interprets the -seat parameter natively only for input devices, not for graphics devices. + To work around this limitation we provide a tiny wrapper /lib/systemd/systemd-multi-seat-x which emulates the enumeration for graphics devices too. + This wrapper will eventually go away, as soon as X learns udev-based graphics device enumeration natively, instead of the current PCI based one. + Hence it is a good idea to fall back to the real X when this wrapper is not found. + You may use this wrapper exactly like the real X server, and internally it will just exec() it after putting together a minimal multi-seat configuration. And that's already it. -While most information about seats, sessions and users is available on systemd-logind's D-Bus interface, this is not the only API. The synchronous [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html) C interface is often easier to use and much faster too. In fact it is possible to implement the scheme above entirely without D-Bus relying only on this API. Note however, that this C API is purely passive, and if you want to execute an actually state changing operation you need to use the bus interface (for example, to switch sessions, or to kill sessions and suchlike). 
Also have a look at the [logind Bus API](http://www.freedesktop.org/wiki/Software/systemd/logind). +While most information about seats, sessions and users is available on systemd-logind's D-Bus interface, this is not the only API. +The synchronous [sd-login(7)](http://www.freedesktop.org/software/systemd/man/sd-login.html) C interface is often easier to use and much faster too. +In fact it is possible to implement the scheme above entirely without D-Bus relying only on this API. +Note however, that this C API is purely passive, and if you want to execute an actually state changing operation you need to use the bus interface (for example, to switch sessions, or to kill sessions and suchlike). +Also have a look at the [logind Bus API](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.login1.html). diff --git a/docs/WRITING_NETWORK_CONFIGURATION_MANAGERS.md b/docs/WRITING_NETWORK_CONFIGURATION_MANAGERS.md index 3a02c3a..7220765 100644 --- a/docs/WRITING_NETWORK_CONFIGURATION_MANAGERS.md +++ b/docs/WRITING_NETWORK_CONFIGURATION_MANAGERS.md @@ -9,12 +9,20 @@ SPDX-License-Identifier: LGPL-2.1-or-later _Or: How to hook up your favourite network configuration manager's DNS logic with `systemd-resolved`_ -_(This is a longer explanation how to use some parts of `systemd-resolved` bus API. If you are just looking for an API reference, consult the [bus API documentation](https://wiki.freedesktop.org/www/Software/systemd/resolved/) instead.)_ - -Since systemd 229 `systemd-resolved` offers a powerful bus API that may be used by network configuration managers (e.g. NetworkManager, connman, …, but also lower level DHCP, VPN or PPP daemons managing specific interfaces) to pass DNS server and DNSSEC configuration directly to `systemd-resolved`. Note that `systemd-resolved` also reads the DNS configuration data in `/etc/resolv.conf`, for compatibility. 
However, by passing the DNS configuration directly to `systemd-resolved` via the bus a couple of benefits are available: - -1. `systemd-resolved` maintains DNS configuration per-interface, instead of simply system-wide, and is capable of sending DNS requests to servers on multiple different network interfaces simultaneously, returning the first positive response (or if all fail, the last negative one). This allows effective "merging" of DNS views on different interfaces, which makes private DNS zones on multi-homed hosts a lot nicer to use. For example, if you are connected to a LAN and a VPN, and both have private DNS zones, then you will be able to resolve both, as long as they don't clash in names. By using the bus API to configure DNS settings, the per-interface configuration is opened up. -2. Per-link configuration of DNSSEC is available. This is particularly interesting for network configuration managers that implement captive portal detection: as long as a verified connection to the Internet is not found DNSSEC should be turned off (as some captive portal systems alter the DNS in order to redirect clients to their internal pages). +_(This is a longer explanation how to use some parts of `systemd-resolved` bus API. If you are just looking for an API reference, consult the [bus API documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.resolve1.html) instead.)_ + +Since systemd 229 `systemd-resolved` offers a powerful bus API that may be used by network configuration managers (e.g. NetworkManager, connman, …, but also lower level DHCP, VPN or PPP daemons managing specific interfaces) to pass DNS server and DNSSEC configuration directly to `systemd-resolved`. +Note that `systemd-resolved` also reads the DNS configuration data in `/etc/resolv.conf`, for compatibility. However, by passing the DNS configuration directly to `systemd-resolved` via the bus a couple of benefits are available: + +1. 
`systemd-resolved` maintains DNS configuration per-interface, instead of simply system-wide, + and is capable of sending DNS requests to servers on multiple different network interfaces simultaneously, returning the first positive response + (or if all fail, the last negative one). + This allows effective "merging" of DNS views on different interfaces, which makes private DNS zones on multi-homed hosts a lot nicer to use. + For example, if you are connected to a LAN and a VPN, and both have private DNS zones, then you will be able to resolve both, as long as they don't clash in names. + By using the bus API to configure DNS settings, the per-interface configuration is opened up. +2. Per-link configuration of DNSSEC is available. This is particularly interesting for network configuration managers that implement captive portal detection: + as long as a verified connection to the Internet is not found DNSSEC should be turned off + (as some captive portal systems alter the DNS in order to redirect clients to their internal pages). 3. Per-link configuration of LLMNR and MulticastDNS is available. 4. In contrast to changes to `/etc/resolv.conf` all changes made via the bus take effect immediately for all future lookups. 5. Statistical data about executed DNS transactions is available, as well as information about whether DNSSEC is supported on the chosen DNS server. @@ -23,7 +31,11 @@ Note that `systemd-networkd` is already hooked up with `systemd-resolved`, expos ## Suggested Mode of Operation -Whenever a network configuration manager sets up an interface for operation, it should pass the DNS configuration information for the interface to `systemd-resolved`. It's recommended to do that after the Linux network interface index ("ifindex") has been allocated, but before the interface has been upped (i.e. `IFF_UP` turned on). That way, `systemd-resolved` will be able to use the configuration the moment the network interface is available. 
(Note that `systemd-resolved` watches the kernel interfaces come and go, and will make use of them as soon as they are suitable to be used, which among other factors requires `IFF_UP` to be set). That said it is OK to change DNS configuration dynamically any time: simply pass the new data to resolved, and it is happy to use it. +Whenever a network configuration manager sets up an interface for operation, it should pass the DNS configuration information for the interface to `systemd-resolved`. +It's recommended to do that after the Linux network interface index ("ifindex") has been allocated, but before the interface has been upped (i.e. `IFF_UP` turned on). +That way, `systemd-resolved` will be able to use the configuration the moment the network interface is available. +(Note that `systemd-resolved` watches the kernel interfaces come and go, and will make use of them as soon as they are suitable to be used, which among other factors requires `IFF_UP` to be set). +That said it is OK to change DNS configuration dynamically any time: simply pass the new data to resolved, and it is happy to use it. In order to pass the DNS configuration information to resolved, use the following methods of the `org.freedesktop.resolve1.Manager` interface of the `/org/freedesktop/resolve1` object, on the `org.freedesktop.resolve1` service: @@ -33,19 +45,31 @@ In order to pass the DNS configuration information to resolved, use the followin 4. To configure DNSSEC Negative Trust Anchors (NTAs, i.e. domains for which not to do DNSSEC validation), use `SetLinkDNSSECNegativeTrustAnchors()` 5. To configure the LLMNR and MulticastDNS mode, use `SetLinkLLMNR()` and `SetLinkMulticastDNS()` -For details about these calls see the [full resolved bus API documentation](https://wiki.freedesktop.org/www/Software/systemd/resolved/). +For details about these calls see the [full resolved bus API documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.resolve1.html). 
-The calls should be pretty obvious to use: they simply take an interface index and the parameters to set. IP addresses are encoded as an address family specifier (an integer, that takes the usual `AF_INET` and `AF_INET6` constants), followed by a 4 or 16 byte array with the address in network byte order. +The calls should be pretty obvious to use: they simply take an interface index and the parameters to set. +IP addresses are encoded as an address family specifier (an integer, that takes the usual `AF_INET` and `AF_INET6` constants), followed by a 4 or 16 byte array with the address in network byte order. -`systemd-resolved` distinguishes between "search" and "routing" domains. Routing domains are used to route DNS requests of specific domains to particular interfaces. i.e. requests for a hostname `foo.bar.com` will be routed to any interface that has `bar.com` as routing domain. The same routing domain may be defined on multiple interfaces, in which case the request is routed to all of them in parallel. Resolver requests for hostnames that do not end in any defined routing domain of any interface will be routed to all suitable interfaces. Search domains work like routing domain, but are also used to qualify single-label domain names. They hence are identical to the traditional search domain logic on UNIX. The `SetLinkDomains()` call may used to define both search and routing domains. +`systemd-resolved` distinguishes between "search" and "routing" domains. +Routing domains are used to route DNS requests of specific domains to particular interfaces. +i.e. requests for a hostname `foo.bar.com` will be routed to any interface that has `bar.com` as routing domain. +The same routing domain may be defined on multiple interfaces, in which case the request is routed to all of them in parallel. +Resolver requests for hostnames that do not end in any defined routing domain of any interface will be routed to all suitable interfaces. 
+Search domains work like routing domains, but are also used to qualify single-label domain names. +They hence are identical to the traditional search domain logic on UNIX. +The `SetLinkDomains()` call may be used to define both search and routing domains. -The most basic support of `systemd-resolved` in a network configuration manager would be to simply invoke `SetLinkDNS()` and `SetLinkDomains()` for the specific interface index with the data traditionally written to `/etc/resolv.conf`. More advanced integration could mean the network configuration manager also makes the DNSSEC mode, the DNSSEC NTAs and the LLMNR/MulticastDNS modes available for configuration. +The most basic support of `systemd-resolved` in a network configuration manager would be to simply invoke `SetLinkDNS()` and `SetLinkDomains()` for the specific interface index with the data traditionally written to `/etc/resolv.conf`. +More advanced integration could mean the network configuration manager also makes the DNSSEC mode, the DNSSEC NTAs and the LLMNR/MulticastDNS modes available for configuration. It is strongly recommended for network configuration managers that implement captive portal detection to turn off DNSSEC validation during the detection phase, so that captive portals that modify DNS do not result in all DNSSEC look-ups to fail. -If a network configuration manager wants to reset specific settings to the defaults (such as the DNSSEC, LLMNR or MulticastDNS mode), it may simply call the function with an empty argument. To reset all per-link changes it made it may call `RevertLink()`. +If a network configuration manager wants to reset specific settings to the defaults (such as the DNSSEC, LLMNR or MulticastDNS mode), it may simply call the function with an empty argument. +To reset all per-link changes it made it may call `RevertLink()`. -To read back the various settings made, use `GetLink()` to get a `org.freedesktop.resolve1.Link` object for a specific network interface.
It exposes the current settings in its bus properties. See the [full bus API documentation](https://wiki.freedesktop.org/www/Software/systemd/resolved/) for details on this. +To read back the various settings made, use `GetLink()` to get a `org.freedesktop.resolve1.Link` object for a specific network interface. +It exposes the current settings in its bus properties. +See the [full bus API documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.resolve1.html) for details on this. In order to translate a network interface name to an interface index, use the usual glibc `if_nametoindex()` call. @@ -55,10 +79,16 @@ Note that it is fully OK if multiple different daemons push DNS configuration da ## Handling of `/etc/resolv.conf` -`systemd-resolved` receives DNS configuration from a number of sources, via the bus, as well as directly from `systemd-networkd` or user configuration. It uses this data to write a file that is compatible with the traditional Linux `/etc/resolv.conf` file. This file is stored in `/run/systemd/resolve/resolv.conf`. It is recommended to symlink `/etc/resolv.conf` to this file, in order to provide compatibility with programs reading the file directly and not going via the NSS and thus `systemd-resolved`. +`systemd-resolved` receives DNS configuration from a number of sources, via the bus, as well as directly from `systemd-networkd` or user configuration. +It uses this data to write a file that is compatible with the traditional Linux `/etc/resolv.conf` file. +This file is stored in `/run/systemd/resolve/resolv.conf`. It is recommended to symlink `/etc/resolv.conf` to this file, in order to provide compatibility with programs reading the file directly and not going via the NSS and thus `systemd-resolved`. -For network configuration managers it is recommended to rely on this resolved-provided mechanism to update `resolv.conf`. 
Specifically, the network configuration manager should stop modifying `/etc/resolv.conf` directly if it notices it being a symlink to `/run/systemd/resolve/resolv.conf`. +For network configuration managers it is recommended to rely on this resolved-provided mechanism to update `resolv.conf`. +Specifically, the network configuration manager should stop modifying `/etc/resolv.conf` directly if it notices it being a symlink to `/run/systemd/resolve/resolv.conf`. -If a system configuration manager desires to be compatible both with systems that use `systemd-resolved` and those which do not, it is recommended to first push any discovered DNS configuration into `systemd-resolved`, and deal gracefully with `systemd-resolved` not being available on the bus. If `/etc/resolv.conf` is a not a symlink to `/run/systemd/resolve/resolv.conf` the manager may then proceed and also update `/etc/resolv.conf`. With this mode of operation optimal compatibility is provided, as `systemd-resolved` is used for `/etc/resolv.conf` management when this is configured, but transparent compatibility with non-`systemd-resolved` systems is maintained. Note that `systemd-resolved` is part of systemd, and hence likely to be pretty universally available on Linux systems soon. +If a system configuration manager desires to be compatible both with systems that use `systemd-resolved` and those which do not, it is recommended to first push any discovered DNS configuration into `systemd-resolved`, and deal gracefully with `systemd-resolved` not being available on the bus. +If `/etc/resolv.conf` is not a symlink to `/run/systemd/resolve/resolv.conf` the manager may then proceed and also update `/etc/resolv.conf`. +With this mode of operation optimal compatibility is provided, as `systemd-resolved` is used for `/etc/resolv.conf` management when this is configured, but transparent compatibility with non-`systemd-resolved` systems is maintained.
+Note that `systemd-resolved` is part of systemd, and hence likely to be pretty universally available on Linux systems soon. By allowing `systemd-resolved` to manage `/etc/resolv.conf` ownership issues regarding different programs overwriting each other's DNS configuration are effectively removed. diff --git a/docs/WRITING_RESOLVER_CLIENTS.md b/docs/WRITING_RESOLVER_CLIENTS.md index 88a873a..93c51c5 100644 --- a/docs/WRITING_RESOLVER_CLIENTS.md +++ b/docs/WRITING_RESOLVER_CLIENTS.md @@ -7,41 +7,53 @@ SPDX-License-Identifier: LGPL-2.1-or-later # Writing Resolver Clients -_Or: How to look up hostnames and arbitrary DNS Resource Records via \_systemd-resolved_'s bus APIs\_ +_Or: How to look up hostnames and arbitrary DNS Resource Records via_ `systemd-resolved` _'s bus APIs_ -_(This is a longer explanation how to use some parts of \_systemd-resolved_ bus API. If you are just looking for an API reference, consult the [bus API documentation](https://wiki.freedesktop.org/www/Software/systemd/resolved/) instead.)\_ +_(This is a longer explanation how to use some parts of_ `systemd-resolved` _bus API. If you are just looking for an API reference, consult the [bus API documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.resolve1.html) instead.)_ -_systemd-resolved_ provides a set of APIs on the bus for resolving DNS resource records. These are: +_`systemd-resolved`_ provides a set of APIs on the bus for resolving DNS resource records. These are: 1. _ResolveHostname()_ for resolving hostnames to acquire their IP addresses 2. _ResolveAddress()_ for the reverse operation: acquire the hostname for an IP address 3. _ResolveService()_ for resolving a DNS-SD or SRV service 4. _ResolveRecord()_ for resolving arbitrary resource records. -Below you'll find examples for two of these calls, to show how to use them. Note that glibc offers similar (and more portable) calls in _getaddrinfo()_, _getnameinfo()_ and _res_query()_. 
Of these _getaddrinfo()_ and _getnameinfo()_ are directed to the calls above via the _nss-resolve_ NSS module, but _req_query()_ is not. There are a number of reasons why it might be preferable to invoke _systemd-resolved_'s bus calls rather than the glibc APIs: +Below you'll find examples for two of these calls, to show how to use them. +Note that glibc offers similar (and more portable) calls in _getaddrinfo()_, _getnameinfo()_ and _res\_query()_. +Of these _getaddrinfo()_ and _getnameinfo()_ are directed to the calls above via the _nss-resolve_ NSS module, but _res\_query()_ is not. +There are a number of reasons why it might be preferable to invoke `systemd-resolved`'s bus calls rather than the glibc APIs: 1. Bus APIs are naturally asynchronous, which the glibc APIs generally are not. -2. The bus calls above pass back substantially more information about the resolved data, including where and how the data was found (i.e. which protocol was used: DNS, LLMNR, MulticastDNS, and on which network interface), and most importantly, whether the data could be authenticated via DNSSEC. This in particular makes these APIs useful for retrieving certificate data from the DNS, in order to implement DANE, SSHFP, OPENGPGKEY and IPSECKEY clients. +2. The bus calls above pass back substantially more information about the resolved data, including where and how the data was found + (i.e. which protocol was used: DNS, LLMNR, MulticastDNS, and on which network interface), and most importantly, whether the data could be authenticated via DNSSEC. + This in particular makes these APIs useful for retrieving certificate data from the DNS, in order to implement DANE, SSHFP, OPENPGPKEY and IPSECKEY clients. 3. _ResolveService()_ knows no counterpart in glibc, and has the benefit of being a single call that collects all data necessary to connect to a DNS-SD or pure SRV service in one step. -4.
_ResolveRecord()_ in contrast to _res_query()_ supports LLMNR and MulticastDNS as protocols on top of DNS, and makes use of _systemd-resolved_'s local DNS record cache. The processing of the request is done in the sandboxed _systemd-resolved_ process rather than in the local process, and all packets are pre-validated. Because this relies on _systemd-resolved_ the per-interface DNS zone handling is supported. +4. _ResolveRecord()_ in contrast to _res\_query()_ supports LLMNR and MulticastDNS as protocols on top of DNS, and makes use of `systemd-resolved`'s local DNS record cache. + The processing of the request is done in the sandboxed `systemd-resolved` process rather than in the local process, and all packets are pre-validated. + Because this relies on `systemd-resolved` the per-interface DNS zone handling is supported. -Of course, by using _systemd-resolved_ you lose some portability, but this could be handled via an automatic fallback to the glibc counterparts. +Of course, by using `systemd-resolved` you lose some portability, but this could be handled via an automatic fallback to the glibc counterparts. -Note that the various resolver calls provided by _systemd-resolved_ will consult _/etc/hosts_ and synthesize resource records for these entries in order to ensure that this file is honoured fully. +Note that the various resolver calls provided by `systemd-resolved` will consult `/etc/hosts` and synthesize resource records for these entries in order to ensure that this file is honoured fully. -The examples below use the _sd-bus_ D-Bus client implementation, which is part of _libsystemd_. Any other D-Bus library, including the original _libdbus_ or _GDBus_ may be used too. +The examples below use the _sd-bus_ D-Bus client implementation, which is part of _libsystemd_. +Any other D-Bus library, including the original _libdbus_ or _GDBus_ may be used too. ## Resolving a Hostname -To resolve a hostname use the _ResolveHostname()_ call. 
For details on the function parameters see the [bus API documentation](https://wiki.freedesktop.org/www/Software/systemd/resolved/). +To resolve a hostname use the _ResolveHostname()_ call. For details on the function parameters see the [bus API documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.resolve1.html). -This example specifies _AF_UNSPEC_ as address family for the requested address. This means both an _AF_INET_ (A) and an _AF_INET6_ (AAAA) record is looked for, depending on whether the local system has configured IPv4 and/or IPv6 connectivity. It is generally recommended to request _AF_UNSPEC_ addresses for best compatibility with both protocols, in particular on dual-stack systems. +This example specifies `AF_UNSPEC` as address family for the requested address. +This means both an _AF\_INET_ (A) and an _AF\_INET6_ (AAAA) record is looked for, depending on whether the local system has configured IPv4 and/or IPv6 connectivity. +It is generally recommended to request `AF_UNSPEC` addresses for best compatibility with both protocols, in particular on dual-stack systems. -The example specifies a network interface index of "0", i.e. does not specify any at all, so that the request may be done on any. Note that the interface index is primarily relevant for LLMNR and MulticastDNS lookups, which distinguish different scopes for each network interface index. +The example specifies a network interface index of "0", i.e. does not specify any at all, so that the request may be done on any. +Note that the interface index is primarily relevant for LLMNR and MulticastDNS lookups, which distinguish different scopes for each network interface index. -This examples makes no use of either the input flags parameter, nor the output flags parameter. See the _ResolveRecord()_ example below for information how to make use of the _SD_RESOLVED_AUTHENTICATED_ bit in the returned flags parameter. 
+This example makes no use of either the input flags parameter or the output flags parameter. +See the _ResolveRecord()_ example below for information on how to make use of the _SD\_RESOLVED\_AUTHENTICATED_ bit in the returned flags parameter. -``` +```c #include #include #include @@ -137,16 +149,20 @@ gcc addrtest.c -o addrtest -Wall `pkg-config --cflags --libs libsystemd` ## Resolving an Arbitrary DNS Resource Record -Use `ResolveRecord()` in order to resolve arbitrary resource records. The call will return the binary RRset data. This calls is useful to acquire resource records for which no high-level calls such as ResolveHostname(), ResolveAddress() and ResolveService() exist. In particular RRs such as MX, SSHFP, TLSA, CERT, OPENPGPKEY or IPSECKEY may be requested via this API. +Use `ResolveRecord()` in order to resolve arbitrary resource records. The call will return the binary RRset data. +This call is useful to acquire resource records for which no high-level calls such as ResolveHostname(), ResolveAddress() and ResolveService() exist. +In particular RRs such as MX, SSHFP, TLSA, CERT, OPENPGPKEY or IPSECKEY may be requested via this API. This example also shows how to determine whether the acquired data has been authenticated via DNSSEC (or another means) by checking the `SD_RESOLVED_AUTHENTICATED` bit in the returned `flags` parameter. -This example contains a simple MX record parser. Note that the data comes pre-validated from `systemd-resolved`, hence we allow the example to parse the record slightly sloppily, to keep the example brief. For details on the MX RR binary format, see [RFC 1035](https://www.rfc-editor.org/rfc/rfc1035.txt). +This example contains a simple MX record parser. +Note that the data comes pre-validated from `systemd-resolved`, hence we allow the example to parse the record slightly sloppily, to keep the example brief. +For details on the MX RR binary format, see [RFC 1035](https://www.rfc-editor.org/rfc/rfc1035.txt). 
-For details on the function parameters see the [bus API documentation](https://wiki.freedesktop.org/www/Software/systemd/resolved/). +For details on the function parameters see the [bus API documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.resolve1.html). -``` +```c #include #include #include diff --git a/docs/WRITING_VM_AND_CONTAINER_MANAGERS.md b/docs/WRITING_VM_AND_CONTAINER_MANAGERS.md index 4d1b649..e3cc280 100644 --- a/docs/WRITING_VM_AND_CONTAINER_MANAGERS.md +++ b/docs/WRITING_VM_AND_CONTAINER_MANAGERS.md @@ -5,25 +5,58 @@ layout: default SPDX-License-Identifier: LGPL-2.1-or-later --- - # Writing VM and Container Managers _Or: How to hook up your favorite VM or container manager with systemd_ -Nomenclature: a _Virtual Machine_ shall refer to a system running on virtualized hardware consisting of a full OS with its own kernel. A _Container_ shall refer to a system running on the same shared kernel of the host, but running a mostly complete OS with its own init system. Both kinds of virtualized systems shall collectively be called "machines". +Nomenclature: a _Virtual Machine_ shall refer to a system running on +virtualized hardware consisting of a full OS with its own kernel. A _Container_ +shall refer to a system running on the same shared kernel of the host, but +running a mostly complete OS with its own init system. Both kinds of +virtualized systems shall collectively be called "machines". -systemd provides a number of integration points with virtual machine and container managers, such as libvirt, LXC or systemd-nspawn. On one hand there are integration points of the VM/container manager towards the host OS it is running on, and on the other there integration points for container managers towards the guest OS it is managing. +systemd provides a number of integration points with virtual machine and +container managers, such as libvirt, LXC or systemd-nspawn. 
On one hand there +are integration points of the VM/container manager towards the host OS it is +running on, and on the other there integration points for container managers +towards the guest OS it is managing. -Note that this document does not cover lightweight containers for the purpose of application sandboxes, i.e. containers that do _not_ run a init system of their own. +Note that this document does not cover lightweight containers for the purpose +of application sandboxes, i.e. containers that do _not_ run a init system of +their own. ## Host OS Integration -All virtual machines and containers should be registered with the [machined](http://www.freedesktop.org/wiki/Software/systemd/machined) mini service that is part of systemd. This provides integration into the core OS at various points. For example, tools like ps, cgls, gnome-system-manager use this registration information to show machine information for running processes, as each of the VM's/container's processes can reliably attributed to a registered machine. The various systemd tools (like systemctl, journalctl, loginctl, systemd-run, ...) all support a -M switch that operates on machines registered with machined. "machinectl" may be used to execute operations on any such machine. When a machine is registered via machined its processes will automatically be placed in a systemd scope unit (that is located in the machines.slice slice) and thus appear in "systemctl" and similar commands. The scope unit name is based on the machine meta information passed to machined at registration. - -For more details on the APIs provided by machine consult [the bus API interface documentation](http://www.freedesktop.org/wiki/Software/systemd/machined). +All virtual machines and containers should be registered with the +[systemd-machined(8)](https://www.freedesktop.org/software/systemd/man/latest/systemd-machined.service.html) +mini service that is part of systemd. 
This provides integration into the core +OS at various points. For example, tools like ps, cgls, gnome-system-manager +use this registration information to show machine information for running +processes, as each of the VM's/container's processes can reliably be attributed to +a registered machine. The various systemd tools (like systemctl, journalctl, +loginctl, systemd-run, ...) all support a -M switch that operates on machines +registered with machined. "machinectl" may be used to execute operations on any +such machine. When a machine is registered via machined its processes will +automatically be placed in a systemd scope unit (that is located in the +machines.slice slice) and thus appear in "systemctl" and similar commands. The +scope unit name is based on the machine meta information passed to machined at +registration. + +For more details on the APIs provided by machined consult [the bus API interface +documentation](https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.machine1.html). ## Guest OS Integration -As container virtualization is much less comprehensive, and the guest is less isolated from the host, there are a number of interfaces defined how the container manager can set up the environment for systemd running inside a container. These Interfaces are documented in [Container Interface of systemd](http://www.freedesktop.org/wiki/Software/systemd/ContainerInterface). - -VM virtualization is more comprehensive and fewer integration APIs are available. In fact there's only one: a VM manager may initialize the SMBIOS DMI field "Product UUUID" to a UUID uniquely identifying this virtual machine instance. This is read in the guest via /sys/class/dmi/id/product_uuid, and used as configuration source for /etc/machine-id if in the guest, if that file is not initialized yet. Note that this is currently only supported for kvm hosts, but may be extended to other managers as well. 
+As container virtualization is much less comprehensive, and the guest is less +isolated from the host, there are a number of interfaces defined how the +container manager can set up the environment for systemd running inside a +container. These interfaces are documented in [Container Interface of +systemd](https://systemd.io/CONTAINER_INTERFACE). + +VM virtualization is more comprehensive and fewer integration APIs are +available. In fact there's only one: a VM manager may initialize the SMBIOS DMI +field "Product UUID" to a UUID uniquely identifying this virtual machine +instance. This is read in the guest via /sys/class/dmi/id/product_uuid, and +used as configuration source for /etc/machine-id in the guest, if that file +is not initialized yet. Note that this is currently only supported for kvm +hosts, but may be extended to other managers as well. diff --git a/docs/assets/f17boot.png b/docs/assets/f17boot.png new file mode 100644 index 0000000..8415b81 Binary files /dev/null and b/docs/assets/f17boot.png differ diff --git a/man/common-variables.xml b/man/common-variables.xml index 81425e5..1aa31e1 100644 --- a/man/common-variables.xml +++ b/man/common-variables.xml @@ -137,6 +137,9 @@ + Note that setting the regular $LESS environment variable has no effect + for less invocations by systemd tools. + See less1 for more discussion. @@ -146,7 +149,10 @@ $SYSTEMD_LESSCHARSET Override the charset passed to less (by default utf-8, if - the invoking terminal is determined to be UTF-8 compatible). + the invoking terminal is determined to be UTF-8 compatible). + + Note that setting the regular $LESSCHARSET environment variable has no effect + for less invocations by systemd tools. 
diff --git a/man/crypttab.xml b/man/crypttab.xml index e94bf1c..126d3a0 100644 --- a/man/crypttab.xml +++ b/man/crypttab.xml @@ -126,7 +126,7 @@ For the latter five mechanisms the source for the key material used for unlocking the volume is primarily configured in the third field of each /etc/crypttab line, but may also - configured in /etc/cryptsetup-keys.d/ and + be configured in /etc/cryptsetup-keys.d/ and /run/cryptsetup-keys.d/ (see above) or in the LUKS2 JSON token header (in case of the latter three). Use the systemd-cryptenroll1 @@ -923,7 +923,7 @@ is acquired by connecting to the socket and reading the key from the connection. The connection is made from an AF_UNIX socket name in the abstract namespace, see unix7 for - details. The source socket name is chosen according the following format: + details. The source socket name is chosen according to the following format: NUL RANDOM /cryptsetup/ VOLUME diff --git a/man/custom-entities.ent.in b/man/custom-entities.ent.in index a854d11..9513235 100644 --- a/man/custom-entities.ent.in +++ b/man/custom-entities.ent.in @@ -18,5 +18,5 @@ - + diff --git a/man/custom-html.xsl b/man/custom-html.xsl index 8b21e15..2373bc3 100644 --- a/man/custom-html.xsl +++ b/man/custom-html.xsl @@ -81,13 +81,11 @@ - https://www.archlinux.org/ - - / + https://man.archlinux.org/man/ . - .html + .en.html diff --git a/man/daemon.xml b/man/daemon.xml index 8fa2506..819ff9b 100644 --- a/man/daemon.xml +++ b/man/daemon.xml @@ -75,7 +75,7 @@ create an independent session. In the child, call fork() again, to ensure that the daemon can - never re-acquire a terminal again. (This relevant if the program — and all its dependencies — does + never re-acquire a terminal again. (This is relevant if the program — and all its dependencies — does not carefully specify `O_NOCTTY` on each and every single `open()` call that might potentially open a TTY device node.) 
@@ -228,7 +228,7 @@ If the service opens sockets or other files on it own, and those file descriptors shall survive a restart, the daemon should store them in the service manager via sd_notify3 with - FDSTORE=1.. + FDSTORE=1. Instead of using the syslog() call to log directly to the system syslog service, a new-style daemon may choose to simply log to standard error via diff --git a/man/event-quick-child.c b/man/event-quick-child.c index 8195efb..b95ee1b 100644 --- a/man/event-quick-child.c +++ b/man/event-quick-child.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include int main(int argc, char **argv) { pid_t pid = fork(); diff --git a/man/hwdb-usb-device.c b/man/hwdb-usb-device.c index 19a5db8..facd8c4 100644 --- a/man/hwdb-usb-device.c +++ b/man/hwdb-usb-device.c @@ -2,16 +2,16 @@ #include #include -#include +#include int print_usb_properties(uint16_t vid, uint16_t pid) { - char match[STRLEN("usb:vp") + DECIMAL_STR_MAX(uint16_t) * 2]; + char match[128]; sd_hwdb *hwdb; const char *key, *value; int r; /* Match this USB vendor and product ID combination */ - xsprintf(match, "usb:v%04Xp%04X", vid, pid); + snprintf(match, sizeof match, "usb:v%04Xp%04X", vid, pid); r = sd_hwdb_new(&hwdb); if (r < 0) diff --git a/man/journalctl.xml b/man/journalctl.xml index bdead3f..d1066b8 100644 --- a/man/journalctl.xml +++ b/man/journalctl.xml @@ -813,7 +813,7 @@ Commands - The following commands are understood. If none is specified the default is to display journal records. + The following commands are understood. If none is specified the default is to display journal records: diff --git a/man/kernel-command-line.xml b/man/kernel-command-line.xml index 6ac20ad..004b394 100644 --- a/man/kernel-command-line.xml +++ b/man/kernel-command-line.xml @@ -688,6 +688,28 @@ + + + systemd.battery_check= + + Accepts a boolean argument. If false the boot-time battery charge check implemented + by + systemd-battery-check.service8 + is disabled. 
+ + + + + + ifname= + net.ifname_policy= + + Controls interface naming policies, implemented by + systemd-network-generator.service8. + + + + diff --git a/man/notify-selfcontained-example.c b/man/notify-selfcontained-example.c new file mode 100644 index 0000000..9a7553e --- /dev/null +++ b/man/notify-selfcontained-example.c @@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: MIT-0 */ + +/* Implement the systemd notify protocol without external dependencies. + * Supports both readiness notification on startup and on reloading, + * according to the protocol defined at: + * https://www.freedesktop.org/software/systemd/man/latest/sd_notify.html + * This protocol is guaranteed to be stable as per: + * https://systemd.io/PORTABILITY_AND_STABILITY/ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define _cleanup_(f) __attribute__((cleanup(f))) + +static void closep(int *fd) { + if (!fd || *fd < 0) + return; + + close(*fd); + *fd = -1; +} + +static int notify(const char *message) { + union sockaddr_union { + struct sockaddr sa; + struct sockaddr_un sun; + } socket_addr = { + .sun.sun_family = AF_UNIX, + }; + size_t path_length, message_length; + _cleanup_(closep) int fd = -1; + const char *socket_path; + + socket_path = getenv("NOTIFY_SOCKET"); + if (!socket_path) + return 0; /* Not running under systemd? 
Nothing to do */ + + if (!message) + return -EINVAL; + + message_length = strlen(message); + if (message_length == 0) + return -EINVAL; + + /* Only AF_UNIX is supported, with path or abstract sockets */ + if (socket_path[0] != '/' && socket_path[0] != '@') + return -EAFNOSUPPORT; + + path_length = strlen(socket_path); + /* Ensure there is room for NUL byte */ + if (path_length >= sizeof(socket_addr.sun.sun_path)) + return -E2BIG; + + memcpy(socket_addr.sun.sun_path, socket_path, path_length); + + /* Support for abstract socket */ + if (socket_addr.sun.sun_path[0] == '@') + socket_addr.sun.sun_path[0] = 0; + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + if (connect(fd, &socket_addr.sa, offsetof(struct sockaddr_un, sun_path) + path_length) != 0) + return -errno; + + ssize_t written = write(fd, message, message_length); + if (written != (ssize_t) message_length) + return written < 0 ? -errno : -EPROTO; + + return 1; /* Notified! */ +} + +static int notify_ready(void) { + return notify("READY=1"); +} + +static int notify_reloading(void) { + /* A buffer with length sufficient to format the maximum UINT64 value. */ + char reload_message[sizeof("RELOADING=1\nMONOTONIC_USEC=18446744073709551615")]; + struct timespec ts; + uint64_t now; + + /* Notify systemd that we are reloading, including a CLOCK_MONOTONIC timestamp in usec + * so that the program is compatible with a Type=notify-reload service. 
*/ + + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) + return -errno; + + if (ts.tv_sec < 0 || ts.tv_nsec < 0 || + (uint64_t) ts.tv_sec > (UINT64_MAX - (ts.tv_nsec / 1000ULL)) / 1000000ULL) + return -EINVAL; + + now = (uint64_t) ts.tv_sec * 1000000ULL + (uint64_t) ts.tv_nsec / 1000ULL; + + if (snprintf(reload_message, sizeof(reload_message), "RELOADING=1\nMONOTONIC_USEC=%" PRIu64, now) < 0) + return -EINVAL; + + return notify(reload_message); +} + +static volatile sig_atomic_t reloading = 0; +static volatile sig_atomic_t terminating = 0; + +static void signal_handler(int sig) { + if (sig == SIGHUP) + reloading = 1; + else if (sig == SIGINT || sig == SIGTERM) + terminating = 1; +} + +int main(int argc, char **argv) { + struct sigaction sa = { + .sa_handler = signal_handler, + .sa_flags = SA_RESTART, + }; + int r; + + /* Setup signal handlers */ + sigemptyset(&sa.sa_mask); + sigaction(SIGHUP, &sa, NULL); + sigaction(SIGINT, &sa, NULL); + sigaction(SIGTERM, &sa, NULL); + + /* Do more service initialization work here … */ + + /* Now that all the preparations steps are done, signal readiness */ + + r = notify_ready(); + if (r < 0) { + fprintf(stderr, "Failed to notify readiness to $NOTIFY_SOCKET: %s\n", strerror(-r)); + return EXIT_FAILURE; + } + + while (!terminating) { + if (reloading) { + reloading = false; + + /* As a separate but related feature, we can also notify the manager + * when reloading configuration. This allows accurate state-tracking, + * and also automated hook-in of 'systemctl reload' without having to + * specify manually an ExecReload= line in the unit file. 
*/ + + r = notify_reloading(); + if (r < 0) { + fprintf(stderr, "Failed to notify reloading to $NOTIFY_SOCKET: %s\n", strerror(-r)); + return EXIT_FAILURE; + } + + /* Do some reconfiguration work here … */ + + r = notify_ready(); + if (r < 0) { + fprintf(stderr, "Failed to notify readiness to $NOTIFY_SOCKET: %s\n", strerror(-r)); + return EXIT_FAILURE; + } + } + + /* Do some daemon work here … */ + sleep(5); + } + + return EXIT_SUCCESS; +} diff --git a/man/org.freedesktop.resolve1.xml b/man/org.freedesktop.resolve1.xml index f9cba4f..c6b0153 100644 --- a/man/org.freedesktop.resolve1.xml +++ b/man/org.freedesktop.resolve1.xml @@ -156,16 +156,6 @@ node /org/freedesktop/resolve1 { }; - - - - - - - - - - @@ -433,6 +423,30 @@ node /org/freedesktop/resolve1 { The RevertLink() method may be used to revert all per-link settings described above to the defaults. + The FlushCaches() method flushes all resource record caches maintained by the + resolver, and ensures that any subsequent lookups re-request their responses from their sources. + + The ResetServerFeatures() method flushes any feature information learned about + remote DNS servers. This ensures that subsequent lookups will be initially attempted at the highest DNS + protocol feature level again, possibly requiring a (potentially slow) downgrade cycle to recognize the + supported feature level again. + + The RegisterService() method may be used to register a DNS-SD service on the + host. This functionality is closely related to the functionality provided by + systemd.dnssd5 + files. It takes a server identifier string as first parameter (this is just a local identifier, and + should be chosen so that it neither collides with the basename of *.dnssd files + nor with names chosen by other IPC clients). It also takes a name template string for the DNS-SD + service name visible on the network. This string is subject to specifier expansion, as documented for + the Name= setting in *.dnssd files. 
It also takes a service + type string containing the DNS-SD service type, as well as an IP port, a priority/weight pair for the + DNS-SD SRV record. Finally, it takes an array of TXT record data. It returns an object path which may be + used as handle to the registered service. + + The UnregisterService() method undoes the effect of + RegisterService() and deletes a DNS-SD service previously created via IPC + again. + The Flags Parameter @@ -635,6 +649,9 @@ node /org/freedesktop/resolve1 { enabled. Possible values are yes (enabled), no (disabled), udp (only the UDP listener is enabled), and tcp (only the TCP listener is enabled). + + The DNSSECNegativeTrustAnchors property contains a list of recognized DNSSEC + negative trust anchors and contains a list of domains. @@ -689,8 +706,6 @@ node /org/freedesktop/resolve1/link/_1 { }; - - diff --git a/man/path-documents.c b/man/path-documents.c index a357dd6..994f20b 100644 --- a/man/path-documents.c +++ b/man/path-documents.c @@ -2,7 +2,7 @@ #include #include -#include +#include int main(void) { int r; diff --git a/man/portablectl.xml b/man/portablectl.xml index 03ca65e..c1946d7 100644 --- a/man/portablectl.xml +++ b/man/portablectl.xml @@ -45,12 +45,12 @@ within the file system context of the image. Portable service images are an efficient way to bundle multiple related services and other units together, - and transfer them as a whole between systems. When these images are attached the local system the contained units + and transfer them as a whole between systems. When these images are attached to the local system, the contained units may run in most ways like regular system-provided units, either with full privileges or inside strict sandboxing, depending on the selected configuration. For more details, see Portable Services Documentation. 
- Specifically portable service images may be of the following kind: + Portable service images may be of the following kinds: Directory trees containing an OS, including the top-level directories /usr/, @@ -397,7 +397,7 @@ multiple times, in which case the order in which images are laid down follows the rules specified in systemd.exec5 for the ExtensionImages= directive and for the - systemd-sysext8 and. + systemd-sysext8 and systemd-confext8 tools. The images must contain an extension-release file with metadata that matches what is defined in the os-release of IMAGE. See: diff --git a/man/repart.d.xml b/man/repart.d.xml index 79908a0..efca8d5 100644 --- a/man/repart.d.xml +++ b/man/repart.d.xml @@ -382,7 +382,7 @@ The file specified here must have a size that is a multiple of the basic block size 512 and not be empty. If this option is used, the size allocation algorithm is slightly altered: the partition is - created as least as big as required to fit the data in, i.e. the data size is an additional minimum + created at least as big as required to fit the data in, i.e. the data size is an additional minimum size value taken into consideration for the allocation algorithm, similar to and in addition to the SizeMin= value configured above. @@ -406,7 +406,7 @@ squashfs or the special value swap. If specified and the partition is newly created it is formatted with the specified file system (or as swap device). The file system UUID and label are automatically derived from the partition UUID and label. If this option is used, - the size allocation algorithm is slightly altered: the partition is created as least as big as + the size allocation algorithm is slightly altered: the partition is created at least as big as required for the minimal file system of the specified type (or 4KiB if the minimal size is not known). 
diff --git a/man/sd_bus_error-example.c b/man/sd_bus_error-example.c index 9b162eb..4b5217c 100644 --- a/man/sd_bus_error-example.c +++ b/man/sd_bus_error-example.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include int writer_with_negative_errno_return(int fd, sd_bus_error *error) { const char *message = "Hello, World!\n"; diff --git a/man/sd_event_add_io.xml b/man/sd_event_add_io.xml index da0fa58..9d4fd27 100644 --- a/man/sd_event_add_io.xml +++ b/man/sd_event_add_io.xml @@ -216,16 +216,20 @@ source object and returns the non-negative file descriptor or a negative error number on error (see below). - sd_event_source_set_io_fd() - changes the UNIX file descriptor of an I/O event source created - previously with sd_event_add_io(). It takes - the event source object and the new file descriptor. - - sd_event_source_set_io_fd_own() controls whether the file descriptor of the event source - shall be closed automatically when the event source is freed, i.e. whether it shall be considered 'owned' by the - event source object. By default it is not closed automatically, and the application has to do this on its own. The - b parameter is a boolean parameter: if zero, the file descriptor is not closed automatically - when the event source is freed, otherwise it is closed. + sd_event_source_set_io_fd() changes the UNIX file descriptor of an I/O event + source created previously with sd_event_add_io(). It takes the event source object + and the new file descriptor. If the event source takes the ownership of the previous file descriptor, + that is, sd_event_source_set_io_fd_own() was called for the event source with a + non-zero value, then the previous file descriptor will be closed and the event source will also take the + ownership of the new file descriptor on success. 
+ + sd_event_source_set_io_fd_own() controls whether the file descriptor of the + event source shall be closed automatically when the event source is freed (or when the file descriptor + assigned to the event source is replaced by sd_event_source_set_io_fd()), i.e. + whether it shall be considered 'owned' by the event source object. By default it is not closed + automatically, and the application has to do this on its own. The b parameter is a + boolean parameter: if zero, the file descriptor is not closed automatically when the event source is + freed, otherwise it is closed. sd_event_source_get_io_fd_own() may be used to query the current setting of the file descriptor ownership boolean flag as set with sd_event_source_set_io_fd_own(). It returns diff --git a/man/sd_journal_get_cursor.xml b/man/sd_journal_get_cursor.xml index 0baae03..29b8bc7 100644 --- a/man/sd_journal_get_cursor.xml +++ b/man/sd_journal_get_cursor.xml @@ -61,12 +61,6 @@ and should be freed after use with free3. - Note that sd_journal_get_cursor() will - not work before - sd_journal_next3 - (or related call) has been called at least once, in order to - position the read pointer at a valid entry. - sd_journal_test_cursor() may be used to check whether the current position in the journal matches the specified cursor. This is @@ -75,10 +69,17 @@ multiple different cursor strings, and hence string comparing cursors is not possible. Use this call to verify after an invocation of - sd_journal_seek_cursor3 + sd_journal_seek_cursor3, whether the entry being sought to was actually found in the journal or the next closest entry was used instead. + + Note that sd_journal_get_cursor() + and sd_journal_test_cursor() + will not work before + sd_journal_next3 + (or one of the other functions which move to an entry) + has been called at least once to position the read pointer at a valid entry. 
diff --git a/man/sd_journal_seek_head.xml b/man/sd_journal_seek_head.xml index 15c72c0..03aec1d 100644 --- a/man/sd_journal_seek_head.xml +++ b/man/sd_journal_seek_head.xml @@ -82,8 +82,7 @@ string. For details on cursors, see sd_journal_get_cursor3. If no entry matching the specified cursor is found the call will seek to the next closest entry (in terms - of time) instead. To verify whether the newly selected entry actually matches the cursor, use - sd_journal_test_cursor3. + of time) instead. Note that these calls do not actually make any entry the new current entry, this needs to be done in a separate step with a subsequent @@ -97,6 +96,13 @@ used, the closest following entry will be sought to, if sd_journal_previous3 is used the closest preceding entry is sought to. + + After the seek is done, and + sd_journal_next3 + or a similar call has been made, + sd_journal_test_cursor3 + may be used to verify whether the newly selected entry actually matches the cursor. + diff --git a/man/sd_notify.xml b/man/sd_notify.xml index 7c32a22..1b96c83 100644 --- a/man/sd_notify.xml +++ b/man/sd_notify.xml @@ -486,6 +486,11 @@ (i.e.: lower than 1024), as an attempt to address concerns that unprivileged processes in the guest might try to send malicious notifications to the host, driving it to make destructive decisions based on them. + + Note that, while using this library should be preferred in order to avoid code duplication, it is + also possible to reimplement the simple readiness notification protocol without external dependencies, + as demonstrated in the following self-contained example: + diff --git a/man/systemctl.xml b/man/systemctl.xml index 25b6e46..9bdd882 100644 --- a/man/systemctl.xml +++ b/man/systemctl.xml @@ -1318,36 +1318,49 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err show-environment - Dump the systemd manager environment block. This is the environment - block that is passed to all processes the manager spawns. 
The environment - block will be dumped in straightforward form suitable for sourcing into - most shells. If no special characters or whitespace is present in the variable - values, no escaping is performed, and the assignments have the form - VARIABLE=value. If whitespace or characters which have - special meaning to the shell are present, dollar-single-quote escaping is - used, and assignments have the form VARIABLE=$'value'. - This syntax is known to be supported by - bash1, - zsh1, - ksh1, - and - busybox1's - ash1, - but not - dash1 - or - fish1. + Dump the systemd manager environment block. This is the environment block that is passed to + all processes the manager spawns. The environment block will be dumped in straightforward form + suitable for sourcing into most shells. If no special characters or whitespace is present in the + variable values, no escaping is performed, and the assignments have the form + VARIABLE=value. If whitespace or characters which have special meaning to the + shell are present, dollar-single-quote escaping is used, and assignments have the form + VARIABLE=$'value'. This syntax is known to be supported by bash1, + zsh1, + ksh1, and + busybox1's + ash1, but + not dash1 or + fish1. + + Note that this shows the effective block, i.e. the combination of + environment variables configured via configuration files, environment generators and via IPC + (i.e. via the set-environment described below). At the moment a unit process + is forked off this combined environment block will be further combined with per-unit environment + variables, which are not visible in this command. set-environment VARIABLE=VALUE - Set one or more systemd manager environment variables, as specified on the command + Set one or more service manager environment variables, as specified on the command line. This command will fail if variable names and values do not conform to the rules listed above. 
+ Note that this operates on an environment block separate from the environment block + configured from service manager configuration and environment generators. Whenever a process is + invoked the two blocks are combined (also incorporating any per-service environment variables), + and passed to it. The show-environment verb will show the combination of the + blocks, see above. + @@ -1355,11 +1368,16 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err unset-environment VARIABLE - Unset one or more systemd manager environment - variables. If only a variable name is specified, it will be - removed regardless of its value. If a variable and a value - are specified, the variable is only removed if it has the - specified value. + Unset one or more systemd manager environment variables. If only a variable name is + specified, it will be removed regardless of its value. If a variable and a value are specified, + the variable is only removed if it has the specified value. + + Note that this operates on an environment block separate from the environment block + configured from service manager configuration and environment generators. Whenever a process is + invoked the two blocks are combined (also incorporating any per-service environment variables), + and passed to it. The show-environment verb will show the combination of the + blocks, see above. Note that this means this command cannot be used to unset environment + variables defined in the service manager configuration files or via generators. diff --git a/man/systemd-bless-boot.service.xml b/man/systemd-bless-boot.service.xml index 66454d1..c39f248 100644 --- a/man/systemd-bless-boot.service.xml +++ b/man/systemd-bless-boot.service.xml @@ -37,7 +37,7 @@ boot counting is used. 
Internally, the service operates based on the LoaderBootCountPath EFI variable (of the - vendor UUID 4a67b082-0a4c-41cf-b6c7-440b29bb8c4), which is passed from the boot loader to the + vendor UUID 4a67b082-0a4c-41cf-b6c7-440b29bb8c4f), which is passed from the boot loader to the OS. It contains a file system path (relative to the EFI system partition) of the Boot Loader Specification compliant boot loader entry file or unified kernel image file that was used to boot up the diff --git a/man/systemd-bsod.service.xml b/man/systemd-bsod.service.xml index 9f54b40..502d239 100644 --- a/man/systemd-bsod.service.xml +++ b/man/systemd-bsod.service.xml @@ -18,7 +18,7 @@ systemd-bsod.service systemd-bsod - Displays boot-time emergency log message in full screen. + Displays boot-time emergency log message in full screen diff --git a/man/systemd-cryptsetup.xml b/man/systemd-cryptsetup.xml index 1d3a313..fb7861c 100644 --- a/man/systemd-cryptsetup.xml +++ b/man/systemd-cryptsetup.xml @@ -52,7 +52,7 @@ systemd-cryptsetup is used to set up (with attach) and tear down (with detach) access to an encrypted block device. It is primarily used via systemd-cryptsetup@.service during early boot, but may also be be called manually. - The positional arguments VOLUME, SOURCEDEVICE, + The positional arguments VOLUME, SOURCE-DEVICE, KEY-FILE, and CRYPTTAB-OPTIONS have the same meaning as the fields in crypttab5. diff --git a/man/systemd-hibernate-resume-generator.xml b/man/systemd-hibernate-resume-generator.xml index 9771350..9483cd9 100644 --- a/man/systemd-hibernate-resume-generator.xml +++ b/man/systemd-hibernate-resume-generator.xml @@ -28,14 +28,11 @@ Description - systemd-hibernate-resume-generator is a - generator that initiates the procedure to resume the system from hibernation. - It creates the + systemd-hibernate-resume-generator is a generator that initiates the procedure + to resume the system from hibernation. 
When kernel command line option or + HibernateLocation EFI variable is detected, it enables the systemd-hibernate-resume.service8 - unit according to the value of parameter - specified on the kernel command line, or the value of EFI variable - HibernateLocation, which will instruct the kernel - to resume the system from the hibernation image on that device. + unit, which will instruct the kernel to resume the system from the hibernation image. diff --git a/man/systemd-hibernate-resume.service.xml b/man/systemd-hibernate-resume.service.xml index 964c2bd..c9c8014 100644 --- a/man/systemd-hibernate-resume.service.xml +++ b/man/systemd-hibernate-resume.service.xml @@ -33,7 +33,7 @@ systemd-hibernate-resume only supports the in-kernel hibernation implementation, see Swap suspend. - Internally, it works by writing the major:minor of specified device node to + Internally, it works by writing the major:minor of selected device node to /sys/power/resume, along with the offset in memory pages (/sys/power/resume_offset) if supported. diff --git a/man/systemd-journald.service.xml b/man/systemd-journald.service.xml index 31435b2..7e252ae 100644 --- a/man/systemd-journald.service.xml +++ b/man/systemd-journald.service.xml @@ -245,6 +245,24 @@ systemd-tmpfiles --create --prefix /var/log/journal + + + systemd.journald.max_level_store= + systemd.journald.max_level_syslog= + systemd.journald.max_level_kmsg= + systemd.journald.max_level_console= + systemd.journald.max_level_wall= + systemd.journald.max_level_socket= + + Controls the maximum log level of messages that are stored in the journal, forwarded + to syslog, kmsg, the console, the wall, or a socket. These kernel command line options override the + settings of the same names in the + journald.conf5 + file. 
+ + + + Note that these kernel command line options are only honoured by the default namespace, see diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index e1e6d84..349bc8a 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -1734,7 +1734,7 @@ After=sys-subsystem-net-devices-ens1.device In order to embed binary data into the credential data for , use C-style escaping (i.e. \n to embed a newline, or \x00 to embed a NUL byte). Note that the invoking shell might already apply unescaping - once, hence this might require double escaping!. + once, hence this might require double escaping! The systemd-sysusers.service8 diff --git a/man/systemd-pcrlock.xml b/man/systemd-pcrlock.xml index f82268c..a364dd3 100644 --- a/man/systemd-pcrlock.xml +++ b/man/systemd-pcrlock.xml @@ -389,7 +389,7 @@ Generates/removes a .pcrlock file based on raw binary data. The data is either read from the specified file or from STDIN (if none is specified). This requires that - is specified. The generated pcrlock file is written to the file specified + is specified. The generated .pcrlock file is written to the file specified via or to STDOUT (if none is specified). @@ -436,7 +436,7 @@ - Specifies to NV index to store the policy in. Honoured by + Specifies the NV index to store the policy in. Honoured by make-policy. If not specified the command will automatically pick a free NV index. diff --git a/man/systemd-poweroff.service.xml b/man/systemd-poweroff.service.xml index b430170..1484a45 100644 --- a/man/systemd-poweroff.service.xml +++ b/man/systemd-poweroff.service.xml @@ -60,7 +60,7 @@ kexec, depending on the chosen action. All executables in this directory are executed in parallel, and execution of the action is not continued before all executables finished. 
Note that these executables are run after all services have been shut down, and after most - mounts have been detached (the root file system as well as /run/ and various API + mounts have been unmounted (the root file system as well as /run/ and various API file systems are still around though). This means any programs dropped into this directory must be prepared to run in such a limited execution environment and not rely on external services or hierarchies such as /var/ to be around (or writable). diff --git a/man/systemd-repart.xml b/man/systemd-repart.xml index 5cd4c1c..27fa257 100644 --- a/man/systemd-repart.xml +++ b/man/systemd-repart.xml @@ -599,7 +599,7 @@ Generate a configuration extension image The following creates a configuration extension DDI (confext) for an - /etc/motd update. + /etc/motd update: mkdir tree tree/etc tree/etc/extension-release.d echo "Hello World" > tree/etc/motd diff --git a/man/systemd-socket-proxyd.xml b/man/systemd-socket-proxyd.xml index 57a6827..59a1073 100644 --- a/man/systemd-socket-proxyd.xml +++ b/man/systemd-socket-proxyd.xml @@ -53,6 +53,11 @@ Accept=no and an event-driven design that scales better with the number of connections. + + Note that systemd-socket-proxyd will not forward socket side channel + information, i.e. will not forward SCM_RIGHTS, SCM_CREDENTIALS, + SCM_SECURITY, SO_PEERCRED, SO_PEERPIDFD, + SO_PEERSEC, SO_PEERGROUPS and similar. Options diff --git a/man/systemd-soft-reboot.service.xml b/man/systemd-soft-reboot.service.xml index e83e18f..c4ee207 100644 --- a/man/systemd-soft-reboot.service.xml +++ b/man/systemd-soft-reboot.service.xml @@ -104,7 +104,7 @@ via SurviveFinalKillSignal=yes, and also be configured to avoid being stopped on isolate via IgnoreOnIsolate=yes. They also have to be configured to be stopped on normal shutdown, reboot and maintenance mode. Finally, they have to be ordered after - basic.target to ensure correct ordeering on boot. 
Note that in case any new or + basic.target to ensure correct ordering on boot. Note that in case any new or custom units are used to isolate to, or that implement an equivalent shutdown functionality, they will also have to be configured manually for correct ordering and conflicting. For example: diff --git a/man/systemd-storagetm.service.xml b/man/systemd-storagetm.service.xml index 4fa7958..1834f85 100644 --- a/man/systemd-storagetm.service.xml +++ b/man/systemd-storagetm.service.xml @@ -67,7 +67,7 @@ exposed NVMe-TCP mass storage devices. The NQN should follow the syntax described in NVM Express Base Specification 2.0c, section 4.5 "NVMe Qualified Names". Note that the NQN - specified here will be suffixed with a dot and the the block device name before it is exposed on the + specified here will be suffixed with a dot and the block device name before it is exposed on the NVMe target. If not specified defaults to nqn.2023-10.io.systemd:storagetm.ID, where ID is replaced by a 128bit ID derived from diff --git a/man/systemd-sysext.xml b/man/systemd-sysext.xml index 7607693..3f0a0c2 100644 --- a/man/systemd-sysext.xml +++ b/man/systemd-sysext.xml @@ -140,7 +140,7 @@ but the used architecture identifiers are the same as for ConditionArchitecture= described in systemd.unit5. EXTENSION_RELOAD_MANAGER= can be set to 1 if the extension requires a service manager reload after application - of the extension. Note that the for the reasons mentioned earlier: + of the extension. Note that for the reasons mentioned earlier: Portable Services remain the recommended way to ship system services. 
diff --git a/man/systemd-timedated.service.xml b/man/systemd-timedated.service.xml index 112bdf3..c0314d5 100644 --- a/man/systemd-timedated.service.xml +++ b/man/systemd-timedated.service.xml @@ -63,7 +63,7 @@ List of network time synchronization services - systemd-timesyncd will look for files with a .list extension + systemd-timedated will look for files with a .list extension in ntp-units.d/ directories. Each file is parsed as a list of unit names, one per line. Empty lines and lines with comments (#) are ignored. Files are read from /usr/lib/systemd/ntp-units.d/ and the corresponding directories under @@ -82,7 +82,7 @@ systemd-timesyncd.service If the environment variable $SYSTEMD_TIMEDATED_NTP_SERVICES is set, - systemd-timesyncd will parse the contents of that variable as a colon-separated list + systemd-timedated will parse the contents of that variable as a colon-separated list of unit names. When set, this variable overrides the file-based list described above. diff --git a/man/systemd-tpm2-setup.service.xml b/man/systemd-tpm2-setup.service.xml index 8c13895..505183a 100644 --- a/man/systemd-tpm2-setup.service.xml +++ b/man/systemd-tpm2-setup.service.xml @@ -37,7 +37,7 @@ The services will store the public key of the SRK key pair in a PEM file in /run/systemd/tpm2-srk-public-key.pem and - /var/lib/systemd/tpm2-srk-public-key.pem. It will also store it in TPM2B_PUBLIC + /var/lib/systemd/tpm2-srk-public-key.pem. They will also store it in TPM2B_PUBLIC format in /run/systemd/tpm2-srk-public-key.tpm2_public and /var/lib/systemd/tpm2-srk-public-key.tpm2b_public. diff --git a/man/systemd-vmspawn.xml b/man/systemd-vmspawn.xml index fa55f8e..9bec440 100644 --- a/man/systemd-vmspawn.xml +++ b/man/systemd-vmspawn.xml @@ -18,7 +18,7 @@ systemd-vmspawn - Spawn an OS in a virtual machine. + Spawn an OS in a virtual machine @@ -171,7 +171,7 @@ In order to embed binary data into the credential data for , use C-style escaping (i.e. 
\n to embed a newline, or \x00 to embed a NUL byte). Note that the invoking shell might already apply unescaping - once, hence this might require double escaping!. + once, hence this might require double escaping! diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index a671649..1e95a94 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1369,6 +1369,11 @@ CapabilityBoundingSet=~CAP_B CAP_C accessible to privileged processes. However, most namespacing settings, that will not work on their own in user services, will work when used in conjunction with PrivateUsers=. + Note that the various options that turn directories read-only (such as + ProtectSystem=, ReadOnlyPaths=, …) do not affect the ability for + programs to connect to and communicate with AF_UNIX sockets in these + directories. These options cannot be used to lock down access to IPC services hence. + @@ -1382,14 +1387,16 @@ CapabilityBoundingSet=~CAP_B CAP_C mounted read-only, except for the API file system subtrees /dev/, /proc/ and /sys/ (protect these directories using PrivateDevices=, ProtectKernelTunables=, - ProtectControlGroups=). This setting ensures that any modification of the vendor-supplied - operating system (and optionally its configuration, and local mounts) is prohibited for the service. It is - recommended to enable this setting for all long-running services, unless they are involved with system updates - or need to modify the operating system in other ways. If this option is used, - ReadWritePaths= may be used to exclude specific directories from being made read-only. This - setting is implied if DynamicUser= is set. This setting cannot ensure protection in all - cases. In general it has the same limitations as ReadOnlyPaths=, see below. Defaults to - off. + ProtectControlGroups=). This setting ensures that any modification of the + vendor-supplied operating system (and optionally its configuration, and local mounts) is prohibited + for the service. 
It is recommended to enable this setting for all long-running services, unless they + are involved with system updates or need to modify the operating system in other ways. If this option + is used, ReadWritePaths= may be used to exclude specific directories from being + made read-only. Similar, StateDirectory=, LogsDirectory=, … and + related directory settings (see below) also exclude the specific directories from the effect of + ProtectSystem=. This setting is implied if DynamicUser= is + set. This setting cannot ensure protection in all cases. In general it has the same limitations as + ReadOnlyPaths=, see below. Defaults to off. @@ -3121,6 +3128,9 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX values are indexed may also be used to implement cross-unit log record matching. Assign an empty string to reset the list. + Note that this functionality is currently only available in system services, not in per-user + services. + @@ -3176,6 +3186,9 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX the kernel log buffer (kmsg), the systemd console, or sent as wall messages to all logged-in users. + Note that this functionality is currently only available in system services, not in per-user + services. + diff --git a/man/systemd.network.xml b/man/systemd.network.xml index 6dd38ea..5f0a703 100644 --- a/man/systemd.network.xml +++ b/man/systemd.network.xml @@ -795,7 +795,7 @@ Table=1234 IPForward= to one of ipv4, ipv6 or yes. Note. Any positive boolean values such as yes or - true are now deprecated. Please use one of the values in the above. + true are now deprecated. Please use one of the values above. @@ -3494,7 +3494,7 @@ Address=192.168.0.1/24 Address=192.168.0.2/24 [DHCPServer] ServerAddress=192.168.0.1/24 - are equivalent to the following. + are equivalent to the following: [Network] DHCPServer=yes Address=192.168.0.2/24 @@ -3856,7 +3856,7 @@ ServerAddress=192.168.0.1/24 Takes a timespan. 
Configures the retransmit time, used by clients to retransmit Neighbor Solicitation messages on address resolution and the Neighbor Unreachability Detection algorithm. - An integer the default unit of seconds, in the range 0…4294967295 msec. Defaults to 0. + An integer, the default unit is seconds, in the range 0…4294967295 msec. Defaults to 0. @@ -3945,8 +3945,8 @@ ServerAddress=192.168.0.1/24 HomeAgent= - Takes a boolean. Specifies that IPv6 router advertisements which indicates to hosts that - the router acts as a Home Agent and includes a Home Agent Option. Defaults to false. See + Takes a boolean. Specifies that IPv6 router advertisements which indicate to hosts that + the router acts as a Home Agent and includes a Home Agent option. Defaults to false. See RFC 6275 for further details. @@ -3956,7 +3956,7 @@ ServerAddress=192.168.0.1/24 HomeAgentLifetimeSec= - Takes a timespan. Specifies the lifetime of the Home Agent. An integer the default unit of seconds, + Takes a timespan. Specifies the lifetime of the Home Agent. An integer, the default unit is seconds, in the range 1…65535. Defaults to the value set to RouterLifetimeSec=. diff --git a/man/systemd.pcrlock.xml b/man/systemd.pcrlock.xml index 5687db5..ec08ffd 100644 --- a/man/systemd.pcrlock.xml +++ b/man/systemd.pcrlock.xml @@ -94,7 +94,7 @@ 250-firmware-code-early.pcrlock Firmware code measurements, as recorded to PCR 0 and 2, up to the separator - measurement (see 400-secureboot-separator.pcrlock. below). May be generated via + measurement (see 400-secureboot-separator.pcrlock below). May be generated via systemd-pcrlock lock-firmware-code. @@ -104,7 +104,7 @@ 250-firmware-config-early.pcrlock Firmware configuration measurements, as recorded to PCR 1 and 3, up to the separator - measurement (see 400-secureboot-separator.pcrlock. below). May be generated via + measurement (see 400-secureboot-separator.pcrlock below). May be generated via systemd-pcrlock lock-firmware-config. 
@@ -140,7 +140,7 @@ 550-firmware-code-late.pcrlock Firmware code measurements, as recorded to PCR 0 and 2, after the separator - measurement (see 400-secureboot-separator.pcrlock. above). May be generated via + measurement (see 400-secureboot-separator.pcrlock above). May be generated via systemd-pcrlock lock-firmware-code. @@ -150,7 +150,7 @@ 550-firmware-config-late.pcrlock Firmware configuration measurements, as recorded to PCR 1 and 3, after the separator - measurement (see 400-secureboot-separator.pcrlock. above). May be generated via + measurement (see 400-secureboot-separator.pcrlock above). May be generated via systemd-pcrlock lock-firmware-config. @@ -178,7 +178,7 @@ 700-action-efi-exit-boot-services.pcrlock The EFI action generated when ExitBootServices() is generated, - i.e. the UEFI environment is left and the OS takes over. Covers the PCR 5 measurement. Statically + i.e. when the UEFI environment is left and the OS takes over. Covers the PCR 5 measurement. Statically defined. diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 42f265c..5c61b74 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -749,6 +749,9 @@ CPUWeight=20 DisableControllers=cpu / \ The system default for this setting may be controlled with DefaultIPAccounting= in systemd-system.conf5. + Note that this functionality is currently only available for system services, not for + per-user services. + @@ -865,8 +868,10 @@ CPUWeight=20 DisableControllers=cpu / \ SocketBindDeny=bind-rule - Allow or deny binding a socket address to a socket by matching it with the bind-rule and - applying a corresponding action if there is a match. + Configures restrictions on the ability of unit processes to invoke bind2 on a + socket. Both allow and deny rules may be defined that restrict which addresses a socket may be bound + to. bind-rule describes socket properties such as address-family, transport-protocol and ip-ports. 
@@ -913,6 +918,13 @@ CPUWeight=20 DisableControllers=cpu / \ The feature is implemented with cgroup/bind4 and cgroup/bind6 cgroup-bpf hooks. + + Note that these settings apply to any bind2 + system call invocation by the unit processes, regardless in which network namespace they are + placed. Or in other words: changing the network namespace is not a suitable mechanism for escaping + these restrictions on bind(). + Examples:… # Allow binding IPv6 socket addresses with a port greater than or equal to 10000. [Service] diff --git a/man/systemd.service.xml b/man/systemd.service.xml index a5f6179..86d7108 100644 --- a/man/systemd.service.xml +++ b/man/systemd.service.xml @@ -1365,7 +1365,7 @@ : - If the executable path is prefixed with :, environment variable substitution (as described by the "Command Lines" section below) is not applied. + If the executable path is prefixed with :, environment variable substitution (as described below this table) is not applied. @@ -1719,7 +1719,7 @@ SystemdService=simple-dbus-service.service Description=Simple notifying service [Service] -Type=notify +Type=notify-reload ExecStart=/usr/sbin/simple-notifying-service [Install] @@ -1737,6 +1737,16 @@ WantedBy=multi-user.target systemd.kill5 for details on how you can influence the way systemd terminates the service. + + To avoid code duplication, it is preferable to use + sd_notify3 + when possible, especially when other APIs provided by + libsystemd3 are + also used, but note that the notification protocol is very simple and guaranteed to be stable as per + the Interface Portability and Stability + Promise, so it can be reimplemented by services with no external dependencies. For a + self-contained example, see + sd_notify3. diff --git a/man/systemd.socket.xml b/man/systemd.socket.xml index 647b7db..73c8c5b 100644 --- a/man/systemd.socket.xml +++ b/man/systemd.socket.xml @@ -756,7 +756,7 @@ TCPCongestion= Takes a string value. Controls the TCP congestion algorithm used by this - socket. 
Should be one of westwood, veno, + socket. Should be one of westwood, reno, cubic, lp or any other available algorithm supported by the IP stack. This setting applies only to stream sockets. diff --git a/man/ukify.xml b/man/ukify.xml index 9b7e209..b882de8 100644 --- a/man/ukify.xml +++ b/man/ukify.xml @@ -476,8 +476,10 @@ SBAT metadata associated with the UKI or addon. SBAT policies are useful to revoke whole groups of UKIs or addons with a single, static policy update that does not take space in DBX/MOKX. If not specified manually, a default metadata entry consisting of - uki,1,UKI,uki,1,https://www.freedesktop.org/software/systemd/man/systemd-stub.html - will be used, to ensure it is always possible to revoke UKIs and addons. For more information on + uki,1,UKI,uki,1,https://uapi-group.org/specifications/specs/unified_kernel_image/ + for UKIs and + uki-addon,1,UKI Addon,addon,1,https://www.freedesktop.org/software/systemd/man/latest/systemd-stub.html + for addons will be used, to ensure it is always possible to revoke them. For more information on SBAT see Shim documentation. 
@@ -560,7 +562,7 @@ --initrd=early_cpio \ --initrd=/some/path/initramfs-6.0.9-300.fc37.x86_64.img \ --sbat='sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md - uki.author.myimage,1,UKI for System,uki.author.myimage,1,https://www.freedesktop.org/software/systemd/man/systemd-stub.html' \ + uki.author.myimage,1,UKI for System,uki.author.myimage,1,https://uapi-group.org/specifications/specs/unified_kernel_image/' \ --pcr-private-key=pcr-private-initrd-key.pem \ --pcr-public-key=pcr-public-initrd-key.pem \ --phases='enter-initrd' \ @@ -633,7 +635,7 @@ $ ukify -c ukify.conf build \ --secureboot-certificate=sb.cert \ --cmdline='debug' \ --sbat='sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md - uki.addon.author,1,UKI Addon for System,uki.addon.author,1,https://www.freedesktop.org/software/systemd/man/systemd-stub.html' + uki-addon.author,1,UKI Addon for System,uki-addon.author,1,https://www.freedesktop.org/software/systemd/man/systemd-stub.html' --output=debug.cmdline diff --git a/man/varlinkctl.xml b/man/varlinkctl.xml index 7dec54c..08d2312 100644 --- a/man/varlinkctl.xml +++ b/man/varlinkctl.xml @@ -50,7 +50,7 @@ call ADDRESS METHOD - PARAMETERS + ARGUMENTS @@ -100,7 +100,7 @@ info ADDRESS Show brief information about the specified service, including vendor name and list of - implemented interfaces. Expects a service address in the formats described above. + implemented interfaces. Expects a service address in one of the formats described above. @@ -109,7 +109,7 @@ list-interfaces ADDRESS Show list of interfaces implemented by the specified service. Expects a service - address in the formats described above. + address in one of the formats described above. @@ -118,7 +118,7 @@ introspect ADDRESS INTERFACE Show interface definition of the specified interface provided by the specified - service. Expects a service address in the formats described above and a Varlink interface + service. 
Expects a service address in one of the formats described above and a Varlink interface name. diff --git a/meson.build b/meson.build index a577ac7..554765b 100644 --- a/meson.build +++ b/meson.build @@ -220,6 +220,7 @@ conf.set_quoted('ENVIRONMENT_DIR', environmentdir) conf.set_quoted('INCLUDE_DIR', includedir) conf.set_quoted('LIBDIR', libdir) conf.set_quoted('LIBEXECDIR', libexecdir) +conf.set_quoted('KERNEL_INSTALL_DIR', kernelinstalldir) conf.set_quoted('MODPROBE_DIR', modprobedir) conf.set_quoted('MODULESLOAD_DIR', modulesloaddir) conf.set_quoted('PKGSYSCONFDIR', pkgsysconfdir) @@ -1676,8 +1677,8 @@ if conf.get('BPF_FRAMEWORK') == 1 bpf_gcc_flags = [ '-std=gnu11', '-fno-stack-protector', + '-fno-ssa-phiopt', '-O2', - '-mkernel=5.2', '-mcpu=v3', '-mco-re', '-gbtf', @@ -1726,7 +1727,7 @@ if conf.get('BPF_FRAMEWORK') == 1 bpf_o_unstripped_cmd += ['-I.'] - if not meson.is_cross_build() and bpf_compiler == 'clang' + if not meson.is_cross_build() target_triplet_cmd = run_command('gcc', '-dumpmachine', check: false) if target_triplet_cmd.returncode() == 0 target_triplet = target_triplet_cmd.stdout().strip() @@ -1812,6 +1813,7 @@ conf.set10('ENABLE_UKIFY', want_ukify) ############################################################ +check_efi_alignment_py = find_program('tools/check-efi-alignment.py') check_version_history_py = find_program('tools/check-version-history.py') elf2efi_py = find_program('tools/elf2efi.py') export_dbus_interfaces_py = find_program('tools/dbus_exporter.py') diff --git a/meson_options.txt b/meson_options.txt index 83b48ff..414b034 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -69,7 +69,7 @@ option('loadkeys-path', type : 'string', description : 'path to loadkeys') option('setfont-path', type : 'string', description : 'path to setfont') option('nologin-path', type : 'string', description : 'path to nologin') -option('debug-shell', type : 'string', value : '/usr/bin/sh', +option('debug-shell', type : 'string', value : '/bin/sh', 
description : 'path to debug shell binary') option('debug-tty', type : 'string', value : '/dev/tty9', description : 'specify the tty device for debug shell') @@ -236,7 +236,7 @@ option('time-epoch', type : 'integer', value : 0, description : 'time epoch for time clients') option('clock-valid-range-usec-max', type : 'integer', value : 473364000000000, # 15 years description : 'maximum value in microseconds for the difference between RTC and epoch, exceeding which is considered an RTC error ["0" disables]') -option('default-user-shell', type : 'string', value : '/usr/bin/bash', +option('default-user-shell', type : 'string', value : '/bin/bash', description : 'default interactive shell') option('system-alloc-uid-min', type : 'integer', value : 0, diff --git a/mkosi.images/base/mkosi.build.chroot b/mkosi.images/base/mkosi.build.chroot index f26098c..02dcbc7 100755 --- a/mkosi.images/base/mkosi.build.chroot +++ b/mkosi.images/base/mkosi.build.chroot @@ -193,12 +193,6 @@ if [ ! -f "$BUILDDIR"/build.ninja ]; then ) fi - if grep -q '^ID="opensuse' /usr/lib/os-release; then - CONFIGURE_OPTS+=( - -Dbpf-compiler=gcc - ) - fi - ( set -x; meson setup "$BUILDDIR" "$SRCDIR" "${CONFIGURE_OPTS[@]}" ) fi diff --git a/mkosi.images/base/mkosi.conf.d/10-opensuse.conf b/mkosi.images/base/mkosi.conf.d/10-opensuse.conf index ec91b49..5aae0ed 100644 --- a/mkosi.images/base/mkosi.conf.d/10-opensuse.conf +++ b/mkosi.images/base/mkosi.conf.d/10-opensuse.conf @@ -44,7 +44,6 @@ Packages= BuildPackages= audit-devel bpftool - cross-bpf-gcc13 dbus-1-devel fdupes gcc-c++ diff --git a/po/POTFILES.skip b/po/POTFILES.skip index c9cb70c..befe6a0 100644 --- a/po/POTFILES.skip +++ b/po/POTFILES.skip @@ -22,7 +22,10 @@ src/hostname/hostnamed.c src/locale/localed.c src/timedate/timedated.c units/debug-shell.service.in +units/systemd-battery-check.service.in +units/systemd-bootctl@.service.in units/systemd-journald.service.in units/systemd-pcrextend@.service.in +units/systemd-pcrlock@.service.in 
units/systemd-timesyncd.service.in units/user@.service.in diff --git a/shell-completion/zsh/_journalctl b/shell-completion/zsh/_journalctl index 5dba1e7..b77bf3f 100644 --- a/shell-completion/zsh/_journalctl +++ b/shell-completion/zsh/_journalctl @@ -59,7 +59,7 @@ _journalctl_boots() { (( $+functions[_journalctl_facilities] )) || _journalctl_facilities() { local -a _journalctl_facilities - _journalctl_facilities=(kern user mail daemon auth syslog lpr news uucp cron authpriv ftp local0 local1 local2 local3 local4 local5 local6 local7) + _journalctl_facilities=(help kern user mail daemon auth syslog lpr news uucp cron authpriv ftp local0 local1 local2 local3 local4 local5 local6 local7) _describe 'possible values' _journalctl_facilities } @@ -113,10 +113,12 @@ _arguments -s \ '--facility=[Filter messages by facility]:facility:_journalctl_facilities' \ {-t+,--identifier=}'[Filter messages by syslog identifier]:identifier:_journalctl_field_values SYSLOG_IDENTIFIER' \ {-c+,--cursor=}'[Start showing entries from the specified cursor]:cursors:_journalctl_field_values __CURSORS' \ - '--cursor-file=[Show entries using cursor store in file]:file:_files' \ + '--cursor-file=[Show entries using cursor stored in file]:file:_files' \ '--after-cursor=[Start showing entries from after the specified cursor]:cursors:_journalctl_field_values __CURSORS' \ '--since=[Start showing entries on or newer than the specified date]:YYYY-MM-DD HH\:MM\:SS' \ '--until=[Stop showing entries on or older than the specified date]:YYYY-MM-DD HH\:MM\:SS' \ + {-g+,--grep=}'[Show entries with MESSAGE field matching PCRE pattern]' \ + '--case-sensitive=[Force case sensitive or insensitive matching]:boolean:(true false)' \ {-F,--field=}'[List all values a certain field takes]:Fields:_journalctl_fields' \ '--system[Show system and kernel messages]' \ '--user[Show messages from user services]' \ diff --git a/shell-completion/zsh/_udevadm b/shell-completion/zsh/_udevadm index 37e9f28..6d31398 100644 --- 
a/shell-completion/zsh/_udevadm +++ b/shell-completion/zsh/_udevadm @@ -167,7 +167,7 @@ _udevadm_commands(){ 'test-builtin:test a built-in command' 'verify:verify udev rules files' 'wait:wait for devices or device symlinks being created' - 'lock:lock a block device and run a comand' + 'lock:lock a block device and run a command' ) if ((CURRENT == 1)); then diff --git a/src/analyze/analyze-srk.c b/src/analyze/analyze-srk.c index 0e24b41..6faf2c2 100644 --- a/src/analyze/analyze-srk.c +++ b/src/analyze/analyze-srk.c @@ -11,9 +11,9 @@ int verb_srk(int argc, char *argv[], void *userdata) { _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL; int r; - r = tpm2_context_new(/* device= */ NULL, &c); + r = tpm2_context_new_or_warn(/* device= */ NULL, &c); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; r = tpm2_get_srk( c, diff --git a/src/backlight/backlight.c b/src/backlight/backlight.c index 5ac9f90..b2032ad 100644 --- a/src/backlight/backlight.c +++ b/src/backlight/backlight.c @@ -55,6 +55,10 @@ static int has_multiple_graphics_cards(void) { if (r < 0) return r; + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + r = sd_device_enumerator_add_match_subsystem(e, "pci", /* match = */ true); if (r < 0) return r; diff --git a/src/basic/chase.c b/src/basic/chase.c index 26bc2d6..9f5477e 100644 --- a/src/basic/chase.c +++ b/src/basic/chase.c @@ -374,11 +374,11 @@ int chaseat(int dir_fd, const char *path, ChaseFlags flags, char **ret_path, int return r; if (FLAGS_SET(flags, CHASE_MKDIR_0755) && !isempty(todo)) { - child = xopenat(fd, - first, - O_DIRECTORY|O_CREAT|O_EXCL|O_NOFOLLOW|O_CLOEXEC, - /* xopen_flags = */ 0, - 0755); + child = xopenat_full(fd, + first, + O_DIRECTORY|O_CREAT|O_EXCL|O_NOFOLLOW|O_CLOEXEC, + /* xopen_flags = */ 0, + 0755); if (child < 0) return child; } else if (FLAGS_SET(flags, CHASE_PARENT) && isempty(todo)) { @@ -760,10 +760,10 @@ int chase_and_open(const char *path, const char *root, 
ChaseFlags chase_flags, i if (empty_or_root(root) && !ret_path && (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) /* Shortcut this call if none of the special features of this call are requested */ - return xopenat(AT_FDCWD, path, - open_flags | (FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? O_NOFOLLOW : 0), - /* xopen_flags = */ 0, - mode); + return xopenat_full(AT_FDCWD, path, + open_flags | (FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? O_NOFOLLOW : 0), + /* xopen_flags = */ 0, + mode); r = chase(path, root, CHASE_PARENT|chase_flags, &p, &path_fd); if (r < 0) @@ -777,7 +777,7 @@ int chase_and_open(const char *path, const char *root, ChaseFlags chase_flags, i return r; } - r = xopenat(path_fd, strempty(fname), open_flags|O_NOFOLLOW, /* xopen_flags = */ 0, mode); + r = xopenat_full(path_fd, strempty(fname), open_flags|O_NOFOLLOW, /* xopen_flags = */ 0, mode); if (r < 0) return r; @@ -964,10 +964,10 @@ int chase_and_openat(int dir_fd, const char *path, ChaseFlags chase_flags, int o if (dir_fd == AT_FDCWD && !ret_path && (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) /* Shortcut this call if none of the special features of this call are requested */ - return xopenat(dir_fd, path, - open_flags | (FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? O_NOFOLLOW : 0), - /* xopen_flags = */ 0, - mode); + return xopenat_full(dir_fd, path, + open_flags | (FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? 
O_NOFOLLOW : 0), + /* xopen_flags = */ 0, + mode); r = chaseat(dir_fd, path, chase_flags|CHASE_PARENT, &p, &path_fd); if (r < 0) @@ -979,7 +979,7 @@ int chase_and_openat(int dir_fd, const char *path, ChaseFlags chase_flags, int o return r; } - r = xopenat(path_fd, strempty(fname), open_flags|O_NOFOLLOW, /* xopen_flags = */ 0, mode); + r = xopenat_full(path_fd, strempty(fname), open_flags|O_NOFOLLOW, /* xopen_flags = */ 0, mode); if (r < 0) return r; diff --git a/src/basic/chattr-util.c b/src/basic/chattr-util.c index fe8b9ab..d76be5c 100644 --- a/src/basic/chattr-util.c +++ b/src/basic/chattr-util.c @@ -29,7 +29,7 @@ int chattr_full( assert(dir_fd >= 0 || dir_fd == AT_FDCWD); - fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); if (fd < 0) return fd; diff --git a/src/basic/env-util.c b/src/basic/env-util.c index d3bf733..a97651d 100644 --- a/src/basic/env-util.c +++ b/src/basic/env-util.c @@ -983,8 +983,8 @@ int putenv_dup(const char *assignment, bool override) { } int setenv_systemd_exec_pid(bool update_only) { - char str[DECIMAL_STR_MAX(pid_t)]; const char *e; + int r; /* Update $SYSTEMD_EXEC_PID=pid except when '*' is set for the variable. */ @@ -995,10 +995,9 @@ int setenv_systemd_exec_pid(bool update_only) { if (streq_ptr(e, "*")) return 0; - xsprintf(str, PID_FMT, getpid_cached()); - - if (setenv("SYSTEMD_EXEC_PID", str, 1) < 0) - return -errno; + r = setenvf("SYSTEMD_EXEC_PID", /* overwrite= */ 1, PID_FMT, getpid_cached()); + if (r < 0) + return r; return 1; } @@ -1093,3 +1092,25 @@ int set_full_environment(char **env) { return 0; } + +int setenvf(const char *name, bool overwrite, const char *valuef, ...) 
{ + _cleanup_free_ char *value = NULL; + va_list ap; + int r; + + assert(name); + + if (!valuef) + return RET_NERRNO(unsetenv(name)); + + va_start(ap, valuef); + DISABLE_WARNING_FORMAT_NONLITERAL; + r = vasprintf(&value, valuef, ap); + REENABLE_WARNING; + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return RET_NERRNO(setenv(name, value, overwrite)); +} diff --git a/src/basic/env-util.h b/src/basic/env-util.h index f7fb1e9..34cf1f9 100644 --- a/src/basic/env-util.h +++ b/src/basic/env-util.h @@ -79,3 +79,5 @@ int getenv_path_list(const char *name, char ***ret_paths); int getenv_steal_erase(const char *name, char **ret); int set_full_environment(char **env); + +int setenvf(const char *name, bool overwrite, const char *valuef, ...) _printf_(3,4); diff --git a/src/basic/filesystems-gperf.gperf b/src/basic/filesystems-gperf.gperf index e8c5357..1cd66b5 100644 --- a/src/basic/filesystems-gperf.gperf +++ b/src/basic/filesystems-gperf.gperf @@ -91,6 +91,7 @@ ocfs2, {OCFS2_SUPER_MAGIC} openpromfs, {OPENPROM_SUPER_MAGIC} orangefs, {ORANGEFS_DEVREQ_MAGIC} overlay, {OVERLAYFS_SUPER_MAGIC} +pidfs, {PID_FS_MAGIC} pipefs, {PIPEFS_MAGIC} ppc-cmm, {PPC_CMM_MAGIC} proc, {PROC_SUPER_MAGIC} diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c index 9ba9268..5bc7d2f 100644 --- a/src/basic/fs-util.c +++ b/src/basic/fs-util.c @@ -1054,7 +1054,7 @@ int open_mkdir_at(int dirfd, const char *path, int flags, mode_t mode) { path = fname; } - fd = xopenat(dirfd, path, flags|O_CREAT|O_DIRECTORY|O_NOFOLLOW, /* xopen_flags = */ 0, mode); + fd = xopenat_full(dirfd, path, flags|O_CREAT|O_DIRECTORY|O_NOFOLLOW, /* xopen_flags = */ 0, mode); if (IN_SET(fd, -ELOOP, -ENOTDIR)) return -EEXIST; if (fd < 0) @@ -1110,7 +1110,7 @@ int openat_report_new(int dirfd, const char *pathname, int flags, mode_t mode, b } } -int xopenat(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode) { +int xopenat_full(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, 
mode_t mode) { _cleanup_close_ int fd = -EBADF; bool made = false; int r; @@ -1191,7 +1191,7 @@ int xopenat(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags return TAKE_FD(fd); } -int xopenat_lock( +int xopenat_lock_full( int dir_fd, const char *path, int open_flags, @@ -1214,7 +1214,7 @@ int xopenat_lock( for (;;) { struct stat st; - fd = xopenat(dir_fd, path, open_flags, xopen_flags, mode); + fd = xopenat_full(dir_fd, path, open_flags, xopen_flags, mode); if (fd < 0) return fd; diff --git a/src/basic/fs-util.h b/src/basic/fs-util.h index 1023ab7..6a1e2e7 100644 --- a/src/basic/fs-util.h +++ b/src/basic/fs-util.h @@ -137,6 +137,12 @@ typedef enum XOpenFlags { XO_SUBVOLUME = 1 << 1, } XOpenFlags; -int xopenat(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode); +int xopenat_full(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode); +static inline int xopenat(int dir_fd, const char *path, int open_flags) { + return xopenat_full(dir_fd, path, open_flags, 0, 0); +} -int xopenat_lock(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode, LockType locktype, int operation); +int xopenat_lock_full(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode, LockType locktype, int operation); +static inline int xopenat_lock(int dir_fd, const char *path, int open_flags, LockType locktype, int operation) { + return xopenat_lock_full(dir_fd, path, open_flags, 0, 0, locktype, operation); +} diff --git a/src/basic/hashmap.h b/src/basic/hashmap.h index 233f1d7..d0ebdf5 100644 --- a/src/basic/hashmap.h +++ b/src/basic/hashmap.h @@ -39,8 +39,8 @@ typedef struct IteratedCache IteratedCache; /* Caches the iterated order of on * by hashmap users, so the definition has to be here. Do not use its fields * directly. 
*/ typedef struct { - unsigned idx; /* index of an entry to be iterated next */ const void *next_key; /* expected value of that entry's key pointer */ + unsigned idx; /* index of an entry to be iterated next */ #if ENABLE_DEBUG_HASHMAP unsigned put_count; /* hashmap's put_count recorded at start of iteration */ unsigned rem_count; /* hashmap's rem_count in previous iteration */ diff --git a/src/basic/lock-util.c b/src/basic/lock-util.c index 047fd01..7bffe85 100644 --- a/src/basic/lock-util.c +++ b/src/basic/lock-util.c @@ -40,13 +40,13 @@ int make_lock_file_at(int dir_fd, const char *p, int operation, LockFile *ret) { if (!t) return -ENOMEM; - fd = xopenat_lock(dfd, - p, - O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, - /* xopen_flags = */ 0, - 0600, - LOCK_UNPOSIX, - operation); + fd = xopenat_lock_full(dfd, + p, + O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, + /* xopen_flags = */ 0, + 0600, + LOCK_UNPOSIX, + operation); if (fd < 0) return fd == -EAGAIN ? -EBUSY : fd; diff --git a/src/basic/log.c b/src/basic/log.c index 1470611..7a44300 100644 --- a/src/basic/log.c +++ b/src/basic/log.c @@ -427,6 +427,8 @@ static int write_to_console( const char *func, const char *buffer) { + static int dumb = -1; + char location[256], header_time[FORMAT_TIMESTAMP_MAX], prefix[1 + DECIMAL_STR_MAX(int) + 2], @@ -438,6 +440,9 @@ static int write_to_console( if (console_fd < 0) return 0; + if (dumb < 0) + dumb = getenv_terminal_is_dumb(); + if (log_target == LOG_TARGET_CONSOLE_PREFIXED) { xsprintf(prefix, "<%i>", level); iovec[n++] = IOVEC_MAKE_STRING(prefix); @@ -481,8 +486,9 @@ static int write_to_console( /* When writing to a TTY we output an extra '\r' (i.e. CR) first, to generate CRNL rather than just * NL. This is a robustness thing in case the TTY is currently in raw mode (specifically: has the * ONLCR flag off). We want that subsequent output definitely starts at the beginning of the line - * again, after all. If the TTY is not in raw mode the extra CR should not hurt. 
*/ - iovec[n++] = IOVEC_MAKE_STRING(check_console_fd_is_tty() ? "\r\n" : "\n"); + * again, after all. If the TTY is not in raw mode the extra CR should not hurt. If we're writing to + * a dumb terminal, only write NL as CRNL might be interpreted as a double newline. */ + iovec[n++] = IOVEC_MAKE_STRING(check_console_fd_is_tty() && !dumb ? "\r\n" : "\n"); if (writev(console_fd, iovec, n) < 0) { diff --git a/src/basic/meson.build b/src/basic/meson.build index d7450d8..111253e 100644 --- a/src/basic/meson.build +++ b/src/basic/meson.build @@ -235,7 +235,7 @@ filesystem_includes = ['linux/magic.h', check_filesystems = find_program('check-filesystems.sh') r = run_command([check_filesystems, cpp, files('filesystems-gperf.gperf')] + filesystem_includes, check: false) if r.returncode() != 0 - error('Unknown filesystems defined in kernel headers:\n\n' + r.stdout()) + warning('Unknown filesystems defined in kernel headers:\n\n' + r.stdout()) endif filesystems_gperf_h = custom_target( diff --git a/src/basic/missing_magic.h b/src/basic/missing_magic.h index 27a33ad..82d71c8 100644 --- a/src/basic/missing_magic.h +++ b/src/basic/missing_magic.h @@ -128,6 +128,11 @@ #define DEVMEM_MAGIC 0x454d444d #endif +/* cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b (6.8) */ +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + /* Not in mainline but included in Ubuntu */ #ifndef SHIFTFS_MAGIC #define SHIFTFS_MAGIC 0x6a656a62 diff --git a/src/basic/os-util.c b/src/basic/os-util.c index dbd067f..985d89b 100644 --- a/src/basic/os-util.c +++ b/src/basic/os-util.c @@ -61,6 +61,39 @@ bool image_name_is_valid(const char *s) { return true; } +int path_extract_image_name(const char *path, char **ret) { + _cleanup_free_ char *fn = NULL; + int r; + + assert(path); + assert(ret); + + /* Extract last component from path, without any "/" suffixes. 
*/ + r = path_extract_filename(path, &fn); + if (r < 0) + return r; + + if (r != O_DIRECTORY) { + /* Chop off any image suffixes we recognize (unless we already know this must refer to some dir */ + FOREACH_STRING(suffix, ".sysext.raw", ".confext.raw", ".raw") { + char *m = endswith(fn, suffix); + if (m) { + *m = 0; + break; + } + } + } + + /* Truncate the version/counting suffixes */ + fn[strcspn(fn, "_+")] = 0; + + if (!image_name_is_valid(fn)) + return -EINVAL; + + *ret = TAKE_PTR(fn); + return 0; +} + int path_is_extension_tree(ImageClass image_class, const char *path, const char *extension, bool relax_extension_release_check) { int r; @@ -230,9 +263,25 @@ int open_extension_release_at( continue; } - if (!relax_extension_release_check && - extension_release_strict_xattr_value(fd, dir_path, de->d_name) != 0) - continue; + if (!relax_extension_release_check) { + _cleanup_free_ char *base_image_name = NULL, *base_extension = NULL; + + r = path_extract_image_name(image_name, &base_image_name); + if (r < 0) { + log_debug_errno(r, "Failed to extract image name from %s/%s, ignoring: %m", dir_path, de->d_name); + continue; + } + + r = path_extract_image_name(extension, &base_extension); + if (r < 0) { + log_debug_errno(r, "Failed to extract image name from %s, ignoring: %m", extension); + continue; + } + + if (!streq(base_image_name, base_extension) && + extension_release_strict_xattr_value(fd, dir_path, image_name) != 0) + continue; + } /* We already found what we were looking for, but there's another candidate? We treat this as * an error, as we want to enforce that there are no ambiguities in case we are in the diff --git a/src/basic/os-util.h b/src/basic/os-util.h index 7cee3dd..f6a12a3 100644 --- a/src/basic/os-util.h +++ b/src/basic/os-util.h @@ -25,6 +25,7 @@ ImageClass image_class_from_string(const char *s) _pure_; * in accordance with the OS extension specification, rather than for /usr/lib/ or /etc/os-release. 
*/ bool image_name_is_valid(const char *s) _pure_; +int path_extract_image_name(const char *path, char **ret); int path_is_extension_tree(ImageClass image_class, const char *path, const char *extension, bool relax_extension_release_check); static inline int path_is_os_tree(const char *path) { diff --git a/src/basic/stat-util.c b/src/basic/stat-util.c index c54374b..581370d 100644 --- a/src/basic/stat-util.c +++ b/src/basic/stat-util.c @@ -262,11 +262,31 @@ int path_is_network_fs(const char *path) { return is_network_fs(&s); } +int stat_verify_linked(const struct stat *st) { + assert(st); + + if (st->st_nlink <= 0) + return -EIDRM; /* recognizable error. */ + + return 0; +} + +int fd_verify_linked(int fd) { + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + return stat_verify_linked(&st); +} + int stat_verify_regular(const struct stat *st) { assert(st); - /* Checks whether the specified stat() structure refers to a regular file. If not returns an appropriate error - * code. */ + /* Checks whether the specified stat() structure refers to a regular file. If not returns an + * appropriate error code. 
*/ if (S_ISDIR(st->st_mode)) return -EISDIR; @@ -470,7 +490,7 @@ int xstatfsat(int dir_fd, const char *path, struct statfs *ret) { assert(dir_fd >= 0 || dir_fd == AT_FDCWD); assert(ret); - fd = xopenat(dir_fd, path, O_PATH|O_CLOEXEC|O_NOCTTY, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, O_PATH|O_CLOEXEC|O_NOCTTY); if (fd < 0) return fd; diff --git a/src/basic/stat-util.h b/src/basic/stat-util.h index ae0aaf8..3501406 100644 --- a/src/basic/stat-util.h +++ b/src/basic/stat-util.h @@ -71,6 +71,9 @@ int path_is_network_fs(const char *path); */ #define F_TYPE_EQUAL(a, b) (a == (typeof(a)) b) +int stat_verify_linked(const struct stat *st); +int fd_verify_linked(int fd); + int stat_verify_regular(const struct stat *st); int fd_verify_regular(int fd); int verify_regular_at(int dir_fd, const char *path, bool follow); diff --git a/src/basic/terminal-util.c b/src/basic/terminal-util.c index 3355b74..530ef9a 100644 --- a/src/basic/terminal-util.c +++ b/src/basic/terminal-util.c @@ -1300,7 +1300,7 @@ static bool on_dev_null(void) { return cached_on_dev_null; } -static bool getenv_terminal_is_dumb(void) { +bool getenv_terminal_is_dumb(void) { const char *e; e = getenv("TERM"); diff --git a/src/basic/terminal-util.h b/src/basic/terminal-util.h index 2a7d48b..b1d7aee 100644 --- a/src/basic/terminal-util.h +++ b/src/basic/terminal-util.h @@ -160,6 +160,7 @@ void columns_lines_cache_reset(int _unused_ signum); void reset_terminal_feature_caches(void); bool on_tty(void); +bool getenv_terminal_is_dumb(void); bool terminal_is_dumb(void); ColorMode get_color_mode(void); bool underline_enabled(void); @@ -186,7 +187,7 @@ static inline bool colors_enabled(void) { } static inline const char *ansi_underline(void) { - return underline_enabled() ? ANSI_UNDERLINE : ANSI_NORMAL; + return underline_enabled() ? 
ANSI_UNDERLINE : ""; } #define DEFINE_ANSI_FUNC_UNDERLINE(name, NAME) \ diff --git a/src/basic/virt.c b/src/basic/virt.c index 93ccfaa..88357a9 100644 --- a/src/basic/virt.c +++ b/src/basic/virt.c @@ -178,6 +178,7 @@ static Virtualization detect_vm_dmi_vendor(void) { { "VMW", VIRTUALIZATION_VMWARE }, { "innotek GmbH", VIRTUALIZATION_ORACLE }, { "VirtualBox", VIRTUALIZATION_ORACLE }, + { "Oracle Corporation", VIRTUALIZATION_ORACLE }, /* Detect VirtualBox on some proprietary systems via the board_vendor */ { "Xen", VIRTUALIZATION_XEN }, { "Bochs", VIRTUALIZATION_BOCHS }, { "Parallels", VIRTUALIZATION_PARALLELS }, diff --git a/src/boot/efi/boot.c b/src/boot/efi/boot.c index 5c0f0ab..a3d5607 100644 --- a/src/boot/efi/boot.c +++ b/src/boot/efi/boot.c @@ -2250,9 +2250,9 @@ static EFI_STATUS initrd_prepare( assert(ret_initrd_size); if (entry->type != LOADER_LINUX || !entry->initrd) { - ret_options = NULL; - ret_initrd = NULL; - ret_initrd_size = 0; + *ret_options = NULL; + *ret_initrd = NULL; + *ret_initrd_size = 0; return EFI_SUCCESS; } diff --git a/src/boot/efi/cpio.c b/src/boot/efi/cpio.c index 5b90e17..c4f803c 100644 --- a/src/boot/efi/cpio.c +++ b/src/boot/efi/cpio.c @@ -65,7 +65,7 @@ static EFI_STATUS pack_cpio_one( char *a; assert(fname); - assert(contents_size || contents_size == 0); + assert(contents || contents_size == 0); assert(target_dir_prefix); assert(inode_counter); assert(cpio_buffer); diff --git a/src/boot/efi/meson.build b/src/boot/efi/meson.build index c95132e..43727ef 100644 --- a/src/boot/efi/meson.build +++ b/src/boot/efi/meson.build @@ -404,6 +404,11 @@ foreach efi_elf_binary : efi_elf_binaries if name == 'addon@0@.efi.stub'.format(efi_arch) efi_addon = exe.full_path() endif + + test('check-alignment-@0@'.format(name), + check_efi_alignment_py, + args : exe.full_path(), + suite : 'efi') endforeach alias_target('systemd-boot', boot_targets) diff --git a/src/boot/efi/stub.c b/src/boot/efi/stub.c index 7ef3e76..0d9df7e 100644 --- a/src/boot/efi/stub.c 
+++ b/src/boot/efi/stub.c @@ -540,6 +540,10 @@ static EFI_STATUS run(EFI_HANDLE image) { CLEANUP_ARRAY(dt_filenames_addons_global, n_dts_addons_global, dt_filenames_free); CLEANUP_ARRAY(dt_filenames_addons_uki, n_dts_addons_uki, dt_filenames_free); + if (szs[UNIFIED_SECTION_UNAME] > 0) + uname = xstrndup8((char *)loaded_image->ImageBase + addrs[UNIFIED_SECTION_UNAME], + szs[UNIFIED_SECTION_UNAME]); + /* Now that we have the UKI sections loaded, also load global first and then local (per-UKI) * addons. The data is loaded at once, and then used later. */ err = load_addons( @@ -614,10 +618,6 @@ static EFI_STATUS run(EFI_HANDLE image) { /* Show splash screen as early as possible */ graphics_splash((const uint8_t*) loaded_image->ImageBase + addrs[UNIFIED_SECTION_SPLASH], szs[UNIFIED_SECTION_SPLASH]); - if (szs[UNIFIED_SECTION_UNAME] > 0) - uname = xstrndup8((char *)loaded_image->ImageBase + addrs[UNIFIED_SECTION_UNAME], - szs[UNIFIED_SECTION_UNAME]); - if (use_load_options(image, loaded_image, szs[UNIFIED_SECTION_CMDLINE] > 0, &cmdline)) { /* Let's measure the passed kernel command line into the TPM. Note that this possibly * duplicates what we already did in the boot menu, if that was already used. 
However, since diff --git a/src/busctl/busctl.c b/src/busctl/busctl.c index 39d22f2..01cb896 100644 --- a/src/busctl/busctl.c +++ b/src/busctl/busctl.c @@ -2021,6 +2021,15 @@ static int call(int argc, char **argv, void *userdata) { if (r < 0) return r; + if (!service_name_is_valid(argv[1])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid service name: %s", argv[1]); + if (!object_path_is_valid(argv[2])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid object path: %s", argv[2]); + if (!interface_name_is_valid(argv[3])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid interface name: %s", argv[3]); + if (!member_name_is_valid(argv[4])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid member name: %s", argv[4]); + r = sd_bus_message_new_method_call(bus, &m, argv[1], argv[2], argv[3], argv[4]); if (r < 0) return bus_log_create_error(r); diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c index 9f290ab..88ab487 100644 --- a/src/core/bpf-socket-bind.c +++ b/src/core/bpf-socket-bind.c @@ -32,6 +32,15 @@ static int update_rules_map( assert(map_fd >= 0); + if (!head) { + static const struct socket_bind_rule val = { + .address_family = SOCKET_BIND_RULE_AF_MATCH_NOTHING, + }; + + if (sym_bpf_map_update_elem(map_fd, &i, &val, BPF_ANY) != 0) + return -errno; + } + LIST_FOREACH(socket_bind_items, item, head) { struct socket_bind_rule val = { .address_family = (uint32_t) item->address_family, diff --git a/src/core/bpf/socket_bind/socket-bind-api.bpf.h b/src/core/bpf/socket_bind/socket-bind-api.bpf.h index 277b9bb..4fe08f1 100644 --- a/src/core/bpf/socket_bind/socket-bind-api.bpf.h +++ b/src/core/bpf/socket_bind/socket-bind-api.bpf.h @@ -7,13 +7,17 @@ */ #include +#include /* * Bind rule is matched with socket fields accessible to cgroup/bind{4,6} hook * through bpf_sock_addr struct. - * 'address_family' is expected to be one of AF_UNSPEC, AF_INET or AF_INET6. 
+ * 'address_family' is expected to be one of AF_UNSPEC, AF_INET, AF_INET6 or the + * magic SOCKET_BIND_RULE_AF_MATCH_NOTHING. * Matching by family is bypassed for rules with AF_UNSPEC set, which makes the * rest of a rule applicable for both IPv4 and IPv6 addresses. + * If SOCKET_BIND_RULE_AF_MATCH_NOTHING is set the rule fails unconditionally + * and other checks are skipped. * If matching by family is either successful or bypassed, a rule and a socket * are matched by ip protocol. * If 'protocol' is 0, matching is bypassed. @@ -49,3 +53,4 @@ struct socket_bind_rule { }; #define SOCKET_BIND_MAX_RULES 128 +#define SOCKET_BIND_RULE_AF_MATCH_NOTHING UINT32_MAX diff --git a/src/core/bpf/socket_bind/socket-bind.bpf.c b/src/core/bpf/socket_bind/socket-bind.bpf.c index b7972a8..da9f9d1 100644 --- a/src/core/bpf/socket_bind/socket-bind.bpf.c +++ b/src/core/bpf/socket_bind/socket-bind.bpf.c @@ -55,6 +55,9 @@ static __always_inline bool match( __u32 protocol, __u16 port, const struct socket_bind_rule *r) { + if (r->address_family == SOCKET_BIND_RULE_AF_MATCH_NOTHING) + return false; + return match_af(address_family, r) && match_protocol(protocol, r) && match_user_port(port, r); diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c index 12724c6..2bf9094 100644 --- a/src/core/dynamic-user.c +++ b/src/core/dynamic-user.c @@ -337,8 +337,10 @@ static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) { * the lock on the socket taken. 
*/ k = receive_one_fd_iov(d->storage_socket[0], &iov, 1, MSG_DONTWAIT, &lock_fd); - if (k < 0) + if (k < 0) { + assert(errno_is_valid(-k)); return (int) k; + } *ret_uid = uid; *ret_lock_fd = lock_fd; diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 28d6142..8e6de15 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3459,7 +3459,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 14]; + int dont_close[n_fds + 15]; assert(params); @@ -3495,6 +3495,8 @@ static int close_remaining_fds( if (params->user_lookup_fd >= 0) dont_close[n_dont_close++] = params->user_lookup_fd; + assert(n_dont_close <= ELEMENTSOF(dont_close)); + return close_all_fds(dont_close, n_dont_close); } diff --git a/src/core/main.c b/src/core/main.c index 3f71cc0..1ed968d 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -627,7 +627,7 @@ static int parse_config_file(void) { { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity }, { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type }, { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy }, - { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, + { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_LEGACY, NULL }, { "Manager", "RuntimeWatchdogSec", config_parse_watchdog_sec, 0, &arg_runtime_watchdog }, { "Manager", "RuntimeWatchdogPreSec", config_parse_watchdog_sec, 0, &arg_pretimeout_watchdog }, { "Manager", "RebootWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c index e9d567a..1ac2636 100644 --- a/src/core/manager-serialize.c +++ b/src/core/manager-serialize.c @@ -153,6 +153,7 @@ int manager_serialize( } (void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit); + (void) serialize_ratelimit(f, "reload-ratelimit", 
&m->reload_ratelimit); bus_track_serialize(m->subscribed, f, "subscribed"); @@ -515,6 +516,8 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { (void) varlink_server_deserialize_one(m->varlink_server, val, fds); } else if ((val = startswith(l, "dump-ratelimit="))) deserialize_ratelimit(&m->dump_ratelimit, "dump-ratelimit", val); + else if ((val = startswith(l, "reload-ratelimit="))) + deserialize_ratelimit(&m->reload_ratelimit, "reload-ratelimit", val); else { ManagerTimestamp q; diff --git a/src/core/mount.c b/src/core/mount.c index ded322d..3c4971c 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -55,7 +55,7 @@ static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = { static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); -static void mount_enter_dead(Mount *m, MountResult f); +static void mount_enter_dead(Mount *m, MountResult f, bool flush_result); static void mount_enter_mounted(Mount *m, MountResult f); static void mount_cycle_clear(Mount *m); static int mount_process_proc_self_mountinfo(Manager *m); @@ -846,7 +846,7 @@ static void mount_catchup(Unit *u) { break; case MOUNT_MOUNTED: assert(!pidref_is_set(&m->control_pid)); - mount_enter_dead(m, MOUNT_SUCCESS); + mount_enter_dead(m, MOUNT_SUCCESS, /* flush_result = */ false); break; default: break; @@ -952,10 +952,10 @@ static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) { return 0; } -static void mount_enter_dead(Mount *m, MountResult f) { +static void mount_enter_dead(Mount *m, MountResult f, bool flush_result) { assert(m); - if (m->result == MOUNT_SUCCESS) + if (m->result == MOUNT_SUCCESS || flush_result) m->result = f; unit_log_result(UNIT(m), m->result == MOUNT_SUCCESS, mount_result_to_string(m->result)); @@ -983,17 +983,20 @@ static void mount_enter_mounted(Mount *m, MountResult f) { mount_set_state(m, MOUNT_MOUNTED); } -static 
void mount_enter_dead_or_mounted(Mount *m, MountResult f) { +static void mount_enter_dead_or_mounted(Mount *m, MountResult f, bool flush_result) { assert(m); - /* Enter DEAD or MOUNTED state, depending on what the kernel currently says about the mount point. We use this - * whenever we executed an operation, so that our internal state reflects what the kernel says again, after all - * ultimately we just mirror the kernel's internal state on this. */ + /* Enter DEAD or MOUNTED state, depending on what the kernel currently says about the mount point. + * We use this whenever we executed an operation, so that our internal state reflects what + * the kernel says again, after all ultimately we just mirror the kernel's internal state on this. + * + * Note that flush_result only applies to mount_enter_dead(), since that's when the result gets + * turned into unit end state. */ if (m->from_proc_self_mountinfo) mount_enter_mounted(m, f); else - mount_enter_dead(m, f); + mount_enter_dead(m, f, flush_result); } static int state_to_kill_operation(MountState state) { @@ -1049,12 +1052,12 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) { else if (state == MOUNT_UNMOUNTING_SIGTERM && m->kill_context.send_sigkill) mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS); else - mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS, /* flush_result = */ false); return; fail: - mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES, /* flush_result = */ false); } static int mount_set_umount_command(Mount *m, ExecCommand *c) { @@ -1116,7 +1119,7 @@ static void mount_enter_unmounting(Mount *m) { return; fail: - mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES, /* flush_result = */ false); } static int mount_set_mount_command(Mount *m, ExecCommand *c, const MountParameters *p) { @@ -1232,7 
+1235,7 @@ static void mount_enter_mounting(Mount *m) { return; fail: - mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES, /* flush_result = */ false); } static void mount_set_reload_result(Mount *m, MountResult result) { @@ -1298,7 +1301,7 @@ static void mount_enter_remounting(Mount *m) { fail: mount_set_reload_result(m, MOUNT_FAILURE_RESOURCES); - mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS, /* flush_result = */ false); } static void mount_cycle_clear(Mount *m) { @@ -1472,8 +1475,8 @@ static int mount_deserialize_item(Unit *u, const char *key, const char *value, F } else if (streq(key, "control-pid")) { - pidref_done(&m->control_pid); - (void) deserialize_pidref(fds, value, &m->control_pid); + if (!pidref_is_set(&m->control_pid)) + (void) deserialize_pidref(fds, value, &m->control_pid); } else if (streq(key, "control-command")) { MountExecCommand id; @@ -1555,7 +1558,8 @@ static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { if (IN_SET(m->state, MOUNT_REMOUNTING, MOUNT_REMOUNTING_SIGKILL, MOUNT_REMOUNTING_SIGTERM)) mount_set_reload_result(m, f); - else if (m->result == MOUNT_SUCCESS) + else if (m->result == MOUNT_SUCCESS && !IN_SET(m->state, MOUNT_MOUNTING, MOUNT_UNMOUNTING)) + /* MOUNT_MOUNTING and MOUNT_UNMOUNTING states need to be patched, see below. */ m->result = f; if (m->control_command) { @@ -1578,15 +1582,15 @@ static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { switch (m->state) { case MOUNT_MOUNTING: - /* Our mount point has not appeared in mountinfo. Something went wrong. */ + /* Our mount point has not appeared in mountinfo. Something went wrong. */ if (f == MOUNT_SUCCESS) { - /* Either /bin/mount has an unexpected definition of success, - * or someone raced us and we lost. */ + /* Either /bin/mount has an unexpected definition of success, or someone raced us + * and we lost. 
*/ log_unit_warning(UNIT(m), "Mount process finished, but there is no mount."); f = MOUNT_FAILURE_PROTOCOL; } - mount_enter_dead(m, f); + mount_enter_dead(m, f, /* flush_result = */ false); break; case MOUNT_MOUNTING_DONE: @@ -1596,13 +1600,11 @@ static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { case MOUNT_REMOUNTING: case MOUNT_REMOUNTING_SIGTERM: case MOUNT_REMOUNTING_SIGKILL: - mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS, /* flush_result = */ false); break; case MOUNT_UNMOUNTING: - if (f == MOUNT_SUCCESS && m->from_proc_self_mountinfo) { - /* Still a mount point? If so, let's try again. Most likely there were multiple mount points * stacked on top of each other. We might exceed the timeout specified by the user overall, * but we will stop as soon as any one umount times out. */ @@ -1613,23 +1615,33 @@ static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { mount_enter_unmounting(m); } else { log_unit_warning(u, "Mount still present after %u attempts to unmount, giving up.", m->n_retry_umount); - mount_enter_mounted(m, f); + mount_enter_mounted(m, MOUNT_FAILURE_PROTOCOL); } + } else if (f == MOUNT_FAILURE_EXIT_CODE && !m->from_proc_self_mountinfo) { + /* Hmm, umount process spawned by us failed, but the mount disappeared anyway? + * Maybe someone else is trying to unmount at the same time. */ + log_unit_notice(u, "Mount disappeared even though umount process failed, continuing."); + mount_enter_dead(m, MOUNT_SUCCESS, /* flush_result = */ true); } else - mount_enter_dead_or_mounted(m, f); + /* At this point, either the unmount succeeded or unexpected error occurred. We usually + * remember the first error in 'result', but here let's update that forcibly, since + * there could previous failed attempts yet we only care about the most recent + * attempt. IOW, if we eventually managed to unmount the stuff, don't enter failed + * end state. 
*/ + mount_enter_dead_or_mounted(m, f, /* flush_result = */ true); break; - case MOUNT_UNMOUNTING_SIGKILL: case MOUNT_UNMOUNTING_SIGTERM: - mount_enter_dead_or_mounted(m, f); + case MOUNT_UNMOUNTING_SIGKILL: + mount_enter_dead_or_mounted(m, f, /* flush_result = */ false); break; case MOUNT_CLEANING: if (m->clean_result == MOUNT_SUCCESS) m->clean_result = f; - mount_enter_dead(m, MOUNT_SUCCESS); + mount_enter_dead(m, MOUNT_SUCCESS, /* flush_result = */ false); break; default: @@ -1668,7 +1680,7 @@ static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *user mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS); } else { log_unit_warning(UNIT(m), "Remounting timed out. Skipping SIGKILL. Ignoring."); - mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS, /* flush_result = */ false); } break; @@ -1676,7 +1688,7 @@ static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *user mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring."); - mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS, /* flush_result = */ false); break; case MOUNT_UNMOUNTING: @@ -1690,13 +1702,13 @@ static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *user mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_FAILURE_TIMEOUT); } else { log_unit_warning(UNIT(m), "Mount process timed out. Skipping SIGKILL. Ignoring."); - mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT, /* flush_result = */ false); } break; case MOUNT_UNMOUNTING_SIGKILL: log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. 
Ignoring."); - mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT, /* flush_result = */ false); break; case MOUNT_CLEANING: @@ -2157,8 +2169,11 @@ static int mount_process_proc_self_mountinfo(Manager *m) { switch (mount->state) { case MOUNT_MOUNTED: - /* This has just been unmounted by somebody else, follow the state change. */ - mount_enter_dead(mount, MOUNT_SUCCESS); + /* This has just been unmounted by somebody else, follow the state change. + * Also explicitly override the result (see the comment in mount_sigchld_event()), + * but more aggressively here since the state change is extrinsic. */ + mount_cycle_clear(mount); + mount_enter_dead(mount, MOUNT_SUCCESS, /* flush_result = */ true); break; case MOUNT_MOUNTING_DONE: @@ -2166,7 +2181,7 @@ static int mount_process_proc_self_mountinfo(Manager *m) { * then remove it because of an internal error. E.g., fuse.sshfs seems * to do that when the connection fails. See #17617. To handle such the * case, let's once set the state back to mounting. Then, the unit can - * correctly enter the failed state later in mount_sigchld(). */ + * correctly enter the failed state later in mount_sigchld_event(). 
*/ mount_set_state(mount, MOUNT_MOUNTING); break; @@ -2330,7 +2345,7 @@ static int mount_can_start(Unit *u) { r = unit_test_start_limit(u); if (r < 0) { - mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT); + mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT, /* flush_result = */ false); return r; } diff --git a/src/core/scope.c b/src/core/scope.c index e4c27da..2841280 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -586,6 +586,8 @@ static int scope_deserialize_item(Unit *u, const char *key, const char *value, F } else if (streq(key, "pids")) { _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + /* We don't check if we already received the pid before here because unit_watch_pidref() + * does this check internally and discards the new pidref if we already received it before. */ if (deserialize_pidref(fds, value, &pidref) >= 0) { r = unit_watch_pidref(u, &pidref, /* exclusive= */ false); if (r < 0) diff --git a/src/core/service.c b/src/core/service.c index 060ac08..ffe92d2 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -3174,14 +3174,14 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, s->reload_result = f; } else if (streq(key, "control-pid")) { - pidref_done(&s->control_pid); - (void) deserialize_pidref(fds, value, &s->control_pid); + if (!pidref_is_set(&s->control_pid)) + (void) deserialize_pidref(fds, value, &s->control_pid); } else if (streq(key, "main-pid")) { _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - if (deserialize_pidref(fds, value, &pidref) >= 0) + if (!pidref_is_set(&s->main_pid) && deserialize_pidref(fds, value, &pidref) >= 0) (void) service_set_main_pidref(s, &pidref); } else if (streq(key, "main-pid-known")) { @@ -3589,8 +3589,10 @@ static void service_notify_cgroup_empty_event(Unit *u) { break; } - if (s->exit_type == SERVICE_EXIT_CGROUP && main_pid_good(s) <= 0) - service_enter_start_post(s); + if (s->exit_type == SERVICE_EXIT_CGROUP && main_pid_good(s) <= 0) { + 
service_enter_stop_post(s, SERVICE_SUCCESS); + break; + } _fallthrough_; case SERVICE_START_POST: @@ -3861,11 +3863,13 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { default: assert_not_reached(); } - } else if (s->exit_type == SERVICE_EXIT_CGROUP && s->state == SERVICE_START) + } else if (s->exit_type == SERVICE_EXIT_CGROUP && s->state == SERVICE_START && + !IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_DBUS)) /* If a main process exits very quickly, this function might be executed * before service_dispatch_exec_io(). Since this function disabled IO events * to monitor the main process above, we need to update the state here too. - * Let's consider the process is successfully launched and exited. */ + * Let's consider the process is successfully launched and exited, but + * only when we're not expecting a readiness notification or dbus name. */ service_enter_start_post(s); } diff --git a/src/core/show-status.c b/src/core/show-status.c index 606237e..5b003ba 100644 --- a/src/core/show-status.c +++ b/src/core/show-status.c @@ -38,6 +38,8 @@ int parse_show_status(const char *v, ShowStatus *ret) { int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) { static const char status_indent[] = " "; /* "[" STATUS "] " */ + static int dumb = -1; + _cleanup_free_ char *s = NULL; _cleanup_close_ int fd = -EBADF; struct iovec iovec[7] = {}; @@ -46,6 +48,9 @@ int status_vprintf(const char *status, ShowStatusFlags flags, const char *format assert(format); + if (dumb < 0) + dumb = getenv_terminal_is_dumb(); + /* This is independent of logging, as status messages are * optional and go exclusively to the console. 
*/ @@ -61,7 +66,7 @@ int status_vprintf(const char *status, ShowStatusFlags flags, const char *format if (fd < 0) return fd; - if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE)) { + if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE) && !dumb) { char *e; size_t emax, sl; int c; @@ -81,7 +86,7 @@ int status_vprintf(const char *status, ShowStatusFlags flags, const char *format free_and_replace(s, e); } - if (prev_ephemeral) + if (prev_ephemeral && !dumb) iovec[n++] = IOVEC_MAKE_STRING(ANSI_REVERSE_LINEFEED "\r" ANSI_ERASE_TO_END_OF_LINE); if (status) { @@ -94,9 +99,11 @@ int status_vprintf(const char *status, ShowStatusFlags flags, const char *format } iovec[n++] = IOVEC_MAKE_STRING(s); - iovec[n++] = IOVEC_MAKE_STRING("\r\n"); /* use CRNL instead of just NL, to be robust towards TTYs in raw mode */ + /* use CRNL instead of just NL, to be robust towards TTYs in raw mode. If we're writing to a dumb + * terminal, use NL as CRNL might be interpreted as a double newline. */ + iovec[n++] = IOVEC_MAKE_STRING(dumb ? 
"\n" : "\r\n"); - if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL)) + if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL) && !dumb) iovec[n++] = IOVEC_MAKE_STRING(ANSI_ERASE_TO_END_OF_LINE); prev_ephemeral = FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL); diff --git a/src/core/socket.c b/src/core/socket.c index 388be62..9adae16 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -2634,8 +2634,9 @@ static int socket_deserialize_item(Unit *u, const char *key, const char *value, else s->n_refused += k; } else if (streq(key, "control-pid")) { - pidref_done(&s->control_pid); - (void) deserialize_pidref(fds, value, &s->control_pid); + + if (!pidref_is_set(&s->control_pid)) + (void) deserialize_pidref(fds, value, &s->control_pid); } else if (streq(key, "control-command")) { SocketExecCommand id; diff --git a/src/core/swap.c b/src/core/swap.c index 488b171..682c2b9 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -989,8 +989,8 @@ static int swap_deserialize_item(Unit *u, const char *key, const char *value, FD s->result = f; } else if (streq(key, "control-pid")) { - pidref_done(&s->control_pid); - (void) deserialize_pidref(fds, value, &s->control_pid); + if (!pidref_is_set(&s->control_pid)) + (void) deserialize_pidref(fds, value, &s->control_pid); } else if (streq(key, "control-command")) { SwapExecCommand id; diff --git a/src/cryptenroll/cryptenroll-tpm2.c b/src/cryptenroll/cryptenroll-tpm2.c index 653ad44..2d93e13 100644 --- a/src/cryptenroll/cryptenroll-tpm2.c +++ b/src/cryptenroll/cryptenroll-tpm2.c @@ -239,9 +239,9 @@ int enroll_tpm2(struct crypt_device *cd, return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Must provide all PCR values when using TPM2 device key."); } else { - r = tpm2_context_new(device, &tpm2_context); + r = tpm2_context_new_or_warn(device, &tpm2_context); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; if (!tpm2_pcr_values_has_all_values(hash_pcr_values, n_hash_pcr_values)) { r = 
tpm2_pcr_read_missing_values(tpm2_context, hash_pcr_values, n_hash_pcr_values); diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c index 72be5cc..846679f 100644 --- a/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c +++ b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c @@ -85,9 +85,9 @@ int acquire_luks2_key( } _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL; - r = tpm2_context_new(device, &tpm2_context); + r = tpm2_context_new_or_warn(device, &tpm2_context); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; r = tpm2_unseal(tpm2_context, hash_pcr_mask, diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h index d84e5a3..8408bab 100644 --- a/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h +++ b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h @@ -14,8 +14,8 @@ int acquire_luks2_key( size_t pubkey_size, uint32_t pubkey_pcr_mask, const char *signature_path, - const char *pcrlock_path, const char *pin, + const char *pcrlock_path, uint16_t primary_alg, const void *key_data, size_t key_data_size, diff --git a/src/cryptsetup/cryptsetup-tpm2.c b/src/cryptsetup/cryptsetup-tpm2.c index f59d5f9..e7a38d4 100644 --- a/src/cryptsetup/cryptsetup-tpm2.c +++ b/src/cryptsetup/cryptsetup-tpm2.c @@ -139,9 +139,9 @@ int acquire_tpm2_key( } _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL; - r = tpm2_context_new(device, &tpm2_context); + r = tpm2_context_new_or_warn(device, &tpm2_context); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; if (!(flags & TPM2_FLAGS_USE_PIN)) { r = tpm2_unseal(tpm2_context, diff --git a/src/cryptsetup/cryptsetup.c b/src/cryptsetup/cryptsetup.c index b56b51a..1822beb 100644 --- a/src/cryptsetup/cryptsetup.c +++ b/src/cryptsetup/cryptsetup.c @@ -846,9 +846,9 @@ static int measure_volume_key( #if HAVE_TPM2 _cleanup_(tpm2_context_unrefp) Tpm2Context *c = 
NULL; - r = tpm2_context_new(arg_tpm2_device, &c); + r = tpm2_context_new_or_warn(arg_tpm2_device, &c); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; _cleanup_strv_free_ char **l = NULL; if (strv_isempty(arg_tpm2_measure_banks)) { diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index 92432b6..c858e6a 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -85,7 +85,7 @@ static bool arg_rmdir = false; static bool arg_in_memory = false; static char **arg_argv = NULL; static char *arg_loop_ref = NULL; -static ImagePolicy* arg_image_policy = NULL; +static ImagePolicy *arg_image_policy = NULL; static bool arg_mtree_hash = true; STATIC_DESTRUCTOR_REGISTER(arg_image, freep); @@ -94,6 +94,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_path, freep); STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done); STATIC_DESTRUCTOR_REGISTER(arg_argv, strv_freep); STATIC_DESTRUCTOR_REGISTER(arg_loop_ref, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); static int help(void) { _cleanup_free_ char *link = NULL; diff --git a/src/home/homed-manager.c b/src/home/homed-manager.c index c452531..b8bef53 100644 --- a/src/home/homed-manager.c +++ b/src/home/homed-manager.c @@ -1040,7 +1040,7 @@ static int manager_bind_varlink(Manager *m) { assert(!m->userdb_service); r = path_extract_filename(socket_path, &m->userdb_service); if (r < 0) - return log_error_errno(r, "Failed to extra filename from socket path '%s': %m", socket_path); + return log_error_errno(r, "Failed to extract filename from socket path '%s': %m", socket_path); /* Avoid recursion */ if (setenv("SYSTEMD_BYPASS_USERDB", m->userdb_service, 1) < 0) diff --git a/src/home/homework-cifs.c b/src/home/homework-cifs.c index 19f1cd5..5d87131 100644 --- a/src/home/homework-cifs.c +++ b/src/home/homework-cifs.c @@ -5,6 +5,7 @@ #include #endif +#include "data-fd-util.h" #include "dirent-util.h" #include "fd-util.h" #include "fileio.h" @@ -24,7 +25,7 
@@ int home_setup_cifs( HomeSetupFlags flags, HomeSetup *setup) { - _cleanup_free_ char *chost = NULL, *cservice = NULL, *cdir = NULL, *chost_and_service = NULL, *j = NULL; + _cleanup_free_ char *chost = NULL, *cservice = NULL, *cdir = NULL, *chost_and_service = NULL, *j = NULL, *options = NULL; int r; assert(h); @@ -53,49 +54,50 @@ int home_setup_cifs( if (!chost_and_service) return log_oom(); + if (asprintf(&options, "user=%s,uid=" UID_FMT ",forceuid,gid=" GID_FMT ",forcegid,file_mode=0%3o,dir_mode=0%3o", + user_record_cifs_user_name(h), h->uid, user_record_gid(h), user_record_access_mode(h), + user_record_access_mode(h)) < 0) + return log_oom(); + + if (h->cifs_domain) + if (strextendf_with_separator(&options, ",", "domain=%s", h->cifs_domain) < 0) + return log_oom(); + + if (h->cifs_extra_mount_options) + if (!strextend_with_separator(&options, ",", h->cifs_extra_mount_options)) + return log_oom(); + r = home_unshare_and_mkdir(); if (r < 0) return r; STRV_FOREACH(pw, h->password) { - _cleanup_(unlink_and_freep) char *p = NULL; - _cleanup_free_ char *options = NULL; - _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int passwd_fd = -EBADF; pid_t mount_pid; int exit_status; - r = fopen_temporary_child(NULL, &f, &p); - if (r < 0) - return log_error_errno(r, "Failed to create temporary credentials file: %m"); - - fprintf(f, - "username=%s\n" - "password=%s\n", - user_record_cifs_user_name(h), - *pw); - - if (h->cifs_domain) - fprintf(f, "domain=%s\n", h->cifs_domain); - - r = fflush_and_check(f); - if (r < 0) - return log_error_errno(r, "Failed to write temporary credentials file: %m"); - - f = safe_fclose(f); - - if (asprintf(&options, "credentials=%s,uid=" UID_FMT ",forceuid,gid=" GID_FMT ",forcegid,file_mode=0%3o,dir_mode=0%3o", - p, h->uid, user_record_gid(h), user_record_access_mode(h), user_record_access_mode(h)) < 0) - return log_oom(); - - if (h->cifs_extra_mount_options) - if (!strextend_with_separator(&options, ",", h->cifs_extra_mount_options)) - 
return log_oom(); + passwd_fd = acquire_data_fd(*pw, strlen(*pw), /* flags= */ 0); + if (passwd_fd < 0) + return log_error_errno(passwd_fd, "Failed to create data FD for password: %m"); r = safe_fork("(mount)", FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_STDOUT_TO_STDERR, &mount_pid); if (r < 0) return r; if (r == 0) { /* Child */ + + r = fd_cloexec(passwd_fd, false); + if (r < 0) { + log_error_errno(r, "Failed to disable CLOEXEC on password FD: %m"); + _exit(EXIT_FAILURE); + } + + r = setenvf("PASSWD_FD", /* overwrite= */ true, "%d", passwd_fd); + if (r < 0) { + log_error_errno(errno, "Failed to set $PASSWD_FD: %m"); + _exit(EXIT_FAILURE); + } + execl("/bin/mount", "/bin/mount", "-n", "-t", "cifs", chost_and_service, HOME_RUNTIME_WORK_DIR, "-o", options, NULL); diff --git a/src/journal-remote/journal-gatewayd.c b/src/journal-remote/journal-gatewayd.c index 0919471..aaa52c0 100644 --- a/src/journal-remote/journal-gatewayd.c +++ b/src/journal-remote/journal-gatewayd.c @@ -47,6 +47,7 @@ static char **arg_file = NULL; STATIC_DESTRUCTOR_REGISTER(arg_key_pem, erase_and_freep); STATIC_DESTRUCTOR_REGISTER(arg_cert_pem, freep); STATIC_DESTRUCTOR_REGISTER(arg_trust_pem, freep); +STATIC_DESTRUCTOR_REGISTER(arg_file, strv_freep); typedef struct RequestMeta { sd_journal *journal; diff --git a/src/journal-remote/journal-remote-main.c b/src/journal-remote/journal-remote-main.c index da0f20d..294719b 100644 --- a/src/journal-remote/journal-remote-main.c +++ b/src/journal-remote/journal-remote-main.c @@ -535,24 +535,6 @@ static int dispatch_http_event(sd_event_source *event, ********************************************************************** **********************************************************************/ -static int setup_signals(RemoteServer *s) { - int r; - - assert(s); - - assert_se(sigprocmask_many(SIG_SETMASK, NULL, SIGINT, SIGTERM, -1) >= 0); - - r = sd_event_add_signal(s->events, &s->sigterm_event, SIGTERM, NULL, s); - if (r < 
0) - return r; - - r = sd_event_add_signal(s->events, &s->sigint_event, SIGINT, NULL, s); - if (r < 0) - return r; - - return 0; -} - static int setup_raw_socket(RemoteServer *s, const char *address) { int fd; @@ -580,9 +562,9 @@ static int create_remoteserver( if (r < 0) return r; - r = setup_signals(s); + r = sd_event_set_signal_exit(s->events, true); if (r < 0) - return log_error_errno(r, "Failed to set up signals: %m"); + return log_error_errno(r, "Failed to install SIGINT/SIGTERM handlers: %m"); n = sd_listen_fds(true); if (n < 0) diff --git a/src/journal-remote/journal-remote.c b/src/journal-remote/journal-remote.c index 79010d0..9db686d 100644 --- a/src/journal-remote/journal-remote.c +++ b/src/journal-remote/journal-remote.c @@ -376,8 +376,6 @@ void journal_remote_server_destroy(RemoteServer *s) { writer_unref(s->_single_writer); hashmap_free(s->writers); - sd_event_source_unref(s->sigterm_event); - sd_event_source_unref(s->sigint_event); sd_event_source_unref(s->listen_event); sd_event_unref(s->events); @@ -517,7 +515,9 @@ static int accept_connection( switch (socket_address_family(addr)) { case AF_INET: - case AF_INET6: { + case AF_INET6: + case AF_VSOCK: + case AF_UNIX: { _cleanup_free_ char *a = NULL; char *b; diff --git a/src/journal-remote/journal-remote.h b/src/journal-remote/journal-remote.h index 8d73f95..3d64db0 100644 --- a/src/journal-remote/journal-remote.h +++ b/src/journal-remote/journal-remote.h @@ -30,7 +30,7 @@ struct RemoteServer { size_t active; sd_event *events; - sd_event_source *sigterm_event, *sigint_event, *listen_event; + sd_event_source *listen_event; Hashmap *writers; Writer *_single_writer; diff --git a/src/journal-remote/journal-upload.c b/src/journal-remote/journal-upload.c index db74355..d70a049 100644 --- a/src/journal-remote/journal-upload.c +++ b/src/journal-remote/journal-upload.c @@ -59,6 +59,8 @@ static int arg_follow = -1; static const char *arg_save_state = NULL; static usec_t arg_network_timeout_usec = USEC_INFINITY; 
+STATIC_DESTRUCTOR_REGISTER(arg_file, strv_freep); + static void close_fd_input(Uploader *u); #define SERVER_ANSWER_KEEP 2048 @@ -378,38 +380,6 @@ static int open_file_for_upload(Uploader *u, const char *filename) { return r; } -static int dispatch_sigterm(sd_event_source *event, - const struct signalfd_siginfo *si, - void *userdata) { - Uploader *u = ASSERT_PTR(userdata); - - log_received_signal(LOG_INFO, si); - - close_fd_input(u); - close_journal_input(u); - - sd_event_exit(u->events, 0); - return 0; -} - -static int setup_signals(Uploader *u) { - int r; - - assert(u); - - assert_se(sigprocmask_many(SIG_SETMASK, NULL, SIGINT, SIGTERM, -1) >= 0); - - r = sd_event_add_signal(u->events, &u->sigterm_event, SIGTERM, dispatch_sigterm, u); - if (r < 0) - return r; - - r = sd_event_add_signal(u->events, &u->sigint_event, SIGINT, dispatch_sigterm, u); - if (r < 0) - return r; - - return 0; -} - static int setup_uploader(Uploader *u, const char *url, const char *state_file) { int r; const char *host, *proto = ""; @@ -449,9 +419,9 @@ static int setup_uploader(Uploader *u, const char *url, const char *state_file) if (r < 0) return log_error_errno(r, "sd_event_default failed: %m"); - r = setup_signals(u); + r = sd_event_set_signal_exit(u->events, true); if (r < 0) - return log_error_errno(r, "Failed to set up signals: %m"); + return log_error_errno(r, "Failed to install SIGINT/SIGTERM handlers: %m"); (void) sd_watchdog_enabled(false, &u->watchdog_usec); @@ -475,8 +445,6 @@ static void destroy_uploader(Uploader *u) { close_fd_input(u); close_journal_input(u); - sd_event_source_unref(u->sigterm_event); - sd_event_source_unref(u->sigint_event); sd_event_unref(u->events); } diff --git a/src/journal-remote/journal-upload.h b/src/journal-remote/journal-upload.h index 9ff5a7b..2007864 100644 --- a/src/journal-remote/journal-upload.h +++ b/src/journal-remote/journal-upload.h @@ -25,7 +25,6 @@ typedef enum { typedef struct Uploader { sd_event *events; - sd_event_source *sigint_event, 
*sigterm_event; char *url; CURL *easy; diff --git a/src/journal/cat.c b/src/journal/cat.c index 609ddba..0325add 100644 --- a/src/journal/cat.c +++ b/src/journal/cat.c @@ -12,6 +12,7 @@ #include "alloc-util.h" #include "build.h" +#include "env-util.h" #include "fd-util.h" #include "format-util.h" #include "main-func.h" @@ -157,7 +158,6 @@ static int run(int argc, char *argv[]) { if (argc <= optind) (void) execl("/bin/cat", "/bin/cat", NULL); else { - _cleanup_free_ char *s = NULL; struct stat st; if (fstat(STDERR_FILENO, &st) < 0) @@ -165,11 +165,9 @@ static int run(int argc, char *argv[]) { "Failed to fstat(%s): %m", FORMAT_PROC_FD_PATH(STDERR_FILENO)); - if (asprintf(&s, DEV_FMT ":" INO_FMT, (dev_t)st.st_dev, st.st_ino) < 0) - return log_oom(); - - if (setenv("JOURNAL_STREAM", s, /* overwrite = */ true) < 0) - return log_error_errno(errno, "Failed to set environment variable JOURNAL_STREAM: %m"); + r = setenvf("JOURNAL_STREAM", /* overwrite = */ true, DEV_FMT ":" INO_FMT, (dev_t) st.st_dev, st.st_ino); + if (r < 0) + return log_error_errno(r, "Failed to set environment variable JOURNAL_STREAM: %m"); (void) execvp(argv[optind], argv + optind); } diff --git a/src/journal/journalctl.c b/src/journal/journalctl.c index 7f3dcd5..45ecc96 100644 --- a/src/journal/journalctl.c +++ b/src/journal/journalctl.c @@ -380,7 +380,7 @@ static int help(void) { " -u --unit=UNIT Show logs from the specified unit\n" " --user-unit=UNIT Show logs from the specified user unit\n" " -t --identifier=STRING Show entries with the specified syslog identifier\n" - " -p --priority=RANGE Show entries with the specified priority\n" + " -p --priority=RANGE Show entries within the specified priority range\n" " --facility=FACILITY... 
Show entries with the specified facilities\n" " -g --grep=PATTERN Show entries with MESSAGE matching PATTERN\n" " --case-sensitive[=BOOL] Force case sensitive or insensitive matching\n" @@ -1938,6 +1938,7 @@ static int update_cursor(sd_journal *j) { typedef struct Context { sd_journal *journal; + bool has_cursor; bool need_seek; bool since_seeked; bool ellipsized; @@ -1967,11 +1968,11 @@ static int show(Context *c) { break; } - if (arg_until_set && !arg_reverse && (arg_lines < 0 || arg_since_set)) { - /* If --lines= is set, we usually rely on the n_shown to tell us - * when to stop. However, if --since= is set too, we may end up - * having less than --lines= to output. In this case let's also - * check if the entry is in range. */ + if (arg_until_set && !arg_reverse && (arg_lines < 0 || arg_since_set || c->has_cursor)) { + /* If --lines= is set, we usually rely on the n_shown to tell us when to stop. + * However, if --since= or one of the cursor argument is set too, we may end up + * having less than --lines= to output. In this case let's also check if the entry + * is in range. 
*/ usec_t usec; @@ -2572,6 +2573,7 @@ static int run(int argc, char *argv[]) { Context c = { .journal = j, + .has_cursor = cursor, .need_seek = need_seek, .since_seeked = since_seeked, }; diff --git a/src/kernel-install/60-ukify.install.in b/src/kernel-install/60-ukify.install.in index be1e21b..0f7a0db 100755 --- a/src/kernel-install/60-ukify.install.in +++ b/src/kernel-install/60-ukify.install.in @@ -109,6 +109,12 @@ def parse_args(args=None): return opts def we_are_wanted() -> bool: + KERNEL_INSTALL_IMAGE_TYPE = os.getenv('KERNEL_INSTALL_IMAGE_TYPE') + + if KERNEL_INSTALL_IMAGE_TYPE == 'uki': + log('The image being installed is already a UKI, quitting.') + return False + KERNEL_INSTALL_LAYOUT = os.getenv('KERNEL_INSTALL_LAYOUT') if KERNEL_INSTALL_LAYOUT != 'uki': diff --git a/src/kernel-install/90-uki-copy.install b/src/kernel-install/90-uki-copy.install index c66c097..d443c4b 100755 --- a/src/kernel-install/90-uki-copy.install +++ b/src/kernel-install/90-uki-copy.install @@ -26,8 +26,6 @@ KERNEL_VERSION="${2:?}" ENTRY_DIR_ABS="$3" KERNEL_IMAGE="$4" -[ "$KERNEL_INSTALL_LAYOUT" = "uki" ] || exit 0 - ENTRY_TOKEN="$KERNEL_INSTALL_ENTRY_TOKEN" BOOT_ROOT="$KERNEL_INSTALL_BOOT_ROOT" @@ -48,6 +46,8 @@ case "$COMMAND" in ;; esac +[ "$KERNEL_INSTALL_LAYOUT" = "uki" ] || exit 0 + if ! 
[ -d "$UKI_DIR" ]; then [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && echo "creating $UKI_DIR" mkdir -p "$UKI_DIR" diff --git a/src/libsystemd-network/dhcp-option.c b/src/libsystemd-network/dhcp-option.c index 5e216c5..5679091 100644 --- a/src/libsystemd-network/dhcp-option.c +++ b/src/libsystemd-network/dhcp-option.c @@ -10,6 +10,8 @@ #include "alloc-util.h" #include "dhcp-option.h" #include "dhcp-server-internal.h" +#include "dns-domain.h" +#include "hostname-util.h" #include "memory-util.h" #include "ordered-set.h" #include "strv.h" @@ -396,27 +398,56 @@ int dhcp_option_parse(DHCPMessage *message, size_t len, dhcp_option_callback_t c } int dhcp_option_parse_string(const uint8_t *option, size_t len, char **ret) { + _cleanup_free_ char *string = NULL; int r; assert(option); assert(ret); - if (len <= 0) - *ret = mfree(*ret); - else { - char *string; + if (len <= 0) { + *ret = NULL; + return 0; + } - /* - * One trailing NUL byte is OK, we don't mind. See: - * https://github.com/systemd/systemd/issues/1337 - */ - r = make_cstring((const char *) option, len, MAKE_CSTRING_ALLOW_TRAILING_NUL, &string); - if (r < 0) - return r; + /* One trailing NUL byte is OK, we don't mind. 
See: + * https://github.com/systemd/systemd/issues/1337 */ + r = make_cstring((const char *) option, len, MAKE_CSTRING_ALLOW_TRAILING_NUL, &string); + if (r < 0) + return r; + + if (!string_is_safe(string) || !utf8_is_valid(string)) + return -EINVAL; + + *ret = TAKE_PTR(string); + return 0; +} + +int dhcp_option_parse_hostname(const uint8_t *option, size_t len, char **ret) { + _cleanup_free_ char *hostname = NULL; + int r; - free_and_replace(*ret, string); + assert(option); + assert(ret); + + r = dhcp_option_parse_string(option, len, &hostname); + if (r < 0) + return r; + + if (!hostname) { + *ret = NULL; + return 0; } + if (!hostname_is_valid(hostname, 0)) + return -EINVAL; + + r = dns_name_is_valid(hostname); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + *ret = TAKE_PTR(hostname); return 0; } diff --git a/src/libsystemd-network/dhcp-option.h b/src/libsystemd-network/dhcp-option.h index 425f5b5..aaa8f84 100644 --- a/src/libsystemd-network/dhcp-option.h +++ b/src/libsystemd-network/dhcp-option.h @@ -44,3 +44,4 @@ int dhcp_option_parse( char **ret_error_message); int dhcp_option_parse_string(const uint8_t *option, size_t len, char **ret); +int dhcp_option_parse_hostname(const uint8_t *option, size_t len, char **ret); diff --git a/src/libsystemd-network/sd-dhcp-lease.c b/src/libsystemd-network/sd-dhcp-lease.c index 4e3be98..202d75f 100644 --- a/src/libsystemd-network/sd-dhcp-lease.c +++ b/src/libsystemd-network/sd-dhcp-lease.c @@ -833,12 +833,16 @@ int dhcp_lease_parse_options(uint8_t code, uint8_t len, const void *option, void break; - case SD_DHCP_OPTION_ROOT_PATH: - r = dhcp_option_parse_string(option, len, &lease->root_path); + case SD_DHCP_OPTION_ROOT_PATH: { + _cleanup_free_ char *p = NULL; + + r = dhcp_option_parse_string(option, len, &p); if (r < 0) log_debug_errno(r, "Failed to parse root path, ignoring: %m"); - break; + free_and_replace(lease->root_path, p); + break; + } case SD_DHCP_OPTION_RENEWAL_TIME: r = lease_parse_be32_seconds(option, 
len, /* max_as_infinity = */ true, &lease->t1); if (r < 0) diff --git a/src/libsystemd-network/sd-dhcp-server.c b/src/libsystemd-network/sd-dhcp-server.c index fcc5b74..b87e4d6 100644 --- a/src/libsystemd-network/sd-dhcp-server.c +++ b/src/libsystemd-network/sd-dhcp-server.c @@ -808,14 +808,16 @@ static int parse_request(uint8_t code, uint8_t len, const void *option, void *us req->agent_info_option = (uint8_t*)option - 2; break; - case SD_DHCP_OPTION_HOST_NAME: - r = dhcp_option_parse_string(option, len, &req->hostname); - if (r < 0) { - log_debug_errno(r, "Failed to parse hostname, ignoring: %m"); - return 0; - } + case SD_DHCP_OPTION_HOST_NAME: { + _cleanup_free_ char *p = NULL; + r = dhcp_option_parse_hostname(option, len, &p); + if (r < 0) + log_debug_errno(r, "Failed to parse hostname, ignoring: %m"); + else + free_and_replace(req->hostname, p); break; + } case SD_DHCP_OPTION_PARAMETER_REQUEST_LIST: req->parameter_request_list = option; req->parameter_request_list_len = len; diff --git a/src/libsystemd/sd-bus/bus-error.c b/src/libsystemd/sd-bus/bus-error.c index 77b2e1a..f415797 100644 --- a/src/libsystemd/sd-bus/bus-error.c +++ b/src/libsystemd/sd-bus/bus-error.c @@ -277,14 +277,16 @@ _public_ int sd_bus_error_setf(sd_bus_error *e, const char *name, const char *fo va_start(ap, format); r = sd_bus_error_setfv(e, name, format, ap); - assert(!name || r < 0); + if (name) + assert(r < 0); va_end(ap); return r; } r = sd_bus_error_set(e, name, NULL); - assert(!name || r < 0); + if (name) + assert(r < 0); return r; } diff --git a/src/libsystemd/sd-device/device-private.h b/src/libsystemd/sd-device/device-private.h index b903d1a..e8a6d52 100644 --- a/src/libsystemd/sd-device/device-private.h +++ b/src/libsystemd/sd-device/device-private.h @@ -20,7 +20,10 @@ int device_opendir(sd_device *device, const char *subdir, DIR **ret); int device_get_property_bool(sd_device *device, const char *key); int device_get_property_int(sd_device *device, const char *key, int *ret); int 
device_get_sysattr_int(sd_device *device, const char *sysattr, int *ret_value); -int device_get_sysattr_unsigned(sd_device *device, const char *sysattr, unsigned *ret_value); +int device_get_sysattr_unsigned_full(sd_device *device, const char *sysattr, unsigned base, unsigned *ret_value); +static inline int device_get_sysattr_unsigned(sd_device *device, const char *sysattr, unsigned *ret_value) { + return device_get_sysattr_unsigned_full(device, sysattr, 0, ret_value); +} int device_get_sysattr_bool(sd_device *device, const char *sysattr); int device_get_device_id(sd_device *device, const char **ret); int device_get_devlink_priority(sd_device *device, int *ret); diff --git a/src/libsystemd/sd-device/sd-device.c b/src/libsystemd/sd-device/sd-device.c index 2fbc619..01e66b4 100644 --- a/src/libsystemd/sd-device/sd-device.c +++ b/src/libsystemd/sd-device/sd-device.c @@ -2435,7 +2435,7 @@ int device_get_sysattr_int(sd_device *device, const char *sysattr, int *ret_valu return v > 0; } -int device_get_sysattr_unsigned(sd_device *device, const char *sysattr, unsigned *ret_value) { +int device_get_sysattr_unsigned_full(sd_device *device, const char *sysattr, unsigned base, unsigned *ret_value) { const char *value; int r; @@ -2444,7 +2444,7 @@ int device_get_sysattr_unsigned(sd_device *device, const char *sysattr, unsigned return r; unsigned v; - r = safe_atou(value, &v); + r = safe_atou_full(value, base, &v); if (r < 0) return log_device_debug_errno(device, r, "Failed to parse '%s' attribute: %m", sysattr); diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 288798a..b6899df 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -2415,7 +2415,7 @@ static int inode_data_realize_watch(sd_event *e, struct inode_data *d) { wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask); if (wd < 0) - return -errno; + return wd; if (d->wd < 0) { r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), 
d); @@ -2637,7 +2637,7 @@ _public_ int sd_event_source_get_io_fd(sd_event_source *s) { } _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) { - int r; + int saved_fd, r; assert_return(s, -EINVAL); assert_return(fd >= 0, -EBADF); @@ -2647,16 +2647,12 @@ _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) { if (s->io.fd == fd) return 0; - if (event_source_is_offline(s)) { - s->io.fd = fd; - s->io.registered = false; - } else { - int saved_fd; + saved_fd = s->io.fd; + s->io.fd = fd; - saved_fd = s->io.fd; - assert(s->io.registered); + assert(event_source_is_offline(s) == !s->io.registered); - s->io.fd = fd; + if (s->io.registered) { s->io.registered = false; r = source_io_register(s, s->enabled, s->io.events); @@ -2669,6 +2665,9 @@ _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) { (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL); } + if (s->io.owned) + safe_close(saved_fd); + return 0; } diff --git a/src/libsystemd/sd-event/test-event.c b/src/libsystemd/sd-event/test-event.c index 63d3ee7..cc3d84e 100644 --- a/src/libsystemd/sd-event/test-event.c +++ b/src/libsystemd/sd-event/test-event.c @@ -828,6 +828,24 @@ TEST(fork) { assert_se(r >= 0); } +TEST(sd_event_source_set_io_fd) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_close_pair_ int pfd_a[2] = EBADF_PAIR, pfd_b[2] = EBADF_PAIR; + + assert_se(sd_event_default(&e) >= 0); + + assert_se(pipe2(pfd_a, O_CLOEXEC) >= 0); + assert_se(pipe2(pfd_b, O_CLOEXEC) >= 0); + + assert_se(sd_event_add_io(e, &s, pfd_a[0], EPOLLIN, NULL, INT_TO_PTR(-ENOANO)) >= 0); + assert_se(sd_event_source_set_io_fd_own(s, true) >= 0); + TAKE_FD(pfd_a[0]); + + assert_se(sd_event_source_set_io_fd(s, pfd_b[0]) >= 0); + TAKE_FD(pfd_b[0]); +} + static int hup_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) { unsigned *c = userdata; diff --git a/src/libsystemd/sd-id128/id128-util.c 
b/src/libsystemd/sd-id128/id128-util.c index 94bfd70..b9714ee 100644 --- a/src/libsystemd/sd-id128/id128-util.c +++ b/src/libsystemd/sd-id128/id128-util.c @@ -138,7 +138,7 @@ int id128_read_at(int dir_fd, const char *path, Id128Flag f, sd_id128_t *ret) { assert(dir_fd >= 0 || dir_fd == AT_FDCWD); assert(path); - fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NOCTTY, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NOCTTY); if (fd < 0) return fd; @@ -184,7 +184,7 @@ int id128_write_at(int dir_fd, const char *path, Id128Flag f, sd_id128_t id) { assert(dir_fd >= 0 || dir_fd == AT_FDCWD); assert(path); - fd = xopenat(dir_fd, path, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY|O_TRUNC, /* xopen_flags = */ 0, 0444); + fd = xopenat_full(dir_fd, path, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY|O_TRUNC, /* xopen_flags = */ 0, 0444); if (fd < 0) return fd; diff --git a/src/libsystemd/sd-journal/journal-file.c b/src/libsystemd/sd-journal/journal-file.c index d2493a0..08cbf86 100644 --- a/src/libsystemd/sd-journal/journal-file.c +++ b/src/libsystemd/sd-journal/journal-file.c @@ -639,7 +639,7 @@ static int journal_file_verify_header(JournalFile *f) { return -ENODATA; if (!VALID_REALTIME(le64toh(f->header->tail_entry_realtime))) return -ENODATA; - if (!VALID_MONOTONIC(le64toh(f->header->tail_entry_realtime))) + if (!VALID_MONOTONIC(le64toh(f->header->tail_entry_monotonic))) return -ENODATA; } else { /* Otherwise, the fields must be zero. 
*/ @@ -650,7 +650,7 @@ static int journal_file_verify_header(JournalFile *f) { return -ENODATA; if (f->header->tail_entry_realtime != 0) return -ENODATA; - if (f->header->tail_entry_realtime != 0) + if (f->header->tail_entry_monotonic != 0) return -ENODATA; } } @@ -736,8 +736,9 @@ int journal_file_fstat(JournalFile *f) { return r; /* Refuse appending to files that are already deleted */ - if (f->last_stat.st_nlink <= 0) - return -EIDRM; + r = stat_verify_linked(&f->last_stat); + if (r < 0) + return r; return 0; } @@ -2532,7 +2533,7 @@ int journal_file_append_entry( ts->realtime); if (!VALID_MONOTONIC(ts->monotonic)) return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), - "Invalid monotomic timestamp %" PRIu64 ", refusing entry.", + "Invalid monotonic timestamp %" PRIu64 ", refusing entry.", ts->monotonic); } else { dual_timestamp_now(&_ts); diff --git a/src/libsystemd/sd-journal/journal-verify.c b/src/libsystemd/sd-journal/journal-verify.c index bdaa01d..b5ce55a 100644 --- a/src/libsystemd/sd-journal/journal-verify.c +++ b/src/libsystemd/sd-journal/journal-verify.c @@ -162,7 +162,7 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o int r; if (le64toh(o->data.entry_offset) == 0) - warning(offset, "Unused data (entry_offset==0)"); + debug(offset, "Unused data (entry_offset==0)"); if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) { error(offset, "Bad n_entries: %"PRIu64, le64toh(o->data.n_entries)); diff --git a/src/libsystemd/sd-journal/sd-journal.c b/src/libsystemd/sd-journal/sd-journal.c index 6b9ff0a..ca1ef0c 100644 --- a/src/libsystemd/sd-journal/sd-journal.c +++ b/src/libsystemd/sd-journal/sd-journal.c @@ -1720,7 +1720,7 @@ static void directory_watch(sd_journal *j, Directory *m, int fd, uint32_t mask) m->wd = inotify_add_watch_fd(j->inotify_fd, fd, mask); if (m->wd < 0) { - log_debug_errno(errno, "Failed to watch journal directory '%s', ignoring: %m", m->path); + log_debug_errno(m->wd, "Failed to watch 
journal directory '%s', ignoring: %m", m->path); return; } diff --git a/src/login/logind-dbus.c b/src/login/logind-dbus.c index ec1f2f3..cd2db2d 100644 --- a/src/login/logind-dbus.c +++ b/src/login/logind-dbus.c @@ -2072,7 +2072,7 @@ static int method_do_shutdown_or_sleep( case SLEEP_RESUME_NOT_SUPPORTED: return sd_bus_error_set(error, BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED, - "Not running on EFI and resume= is not set. No available method to resume from hibernation"); + "Not running on EFI and resume= is not set, or noresume is set. No available method to resume from hibernation"); case SLEEP_NOT_ENOUGH_SWAP_SPACE: return sd_bus_error_set(error, BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED, diff --git a/src/network/networkd-state-file.c b/src/network/networkd-state-file.c index 3a95ba8..bba84bb 100644 --- a/src/network/networkd-state-file.c +++ b/src/network/networkd-state-file.c @@ -127,7 +127,7 @@ static int link_put_dns(Link *link, OrderedSet **s) { NDiscRDNSS *a; SET_FOREACH(a, link->ndisc_rdnss) { - r = ordered_set_put_in6_addrv(s, &a->router, 1); + r = ordered_set_put_in6_addrv(s, &a->address, 1); if (r < 0) return r; } @@ -190,7 +190,7 @@ static int link_put_sip(Link *link, OrderedSet **s) { assert(link->network); assert(s); - if (link->dhcp_lease && link->network->dhcp_use_ntp) { + if (link->dhcp_lease && link->network->dhcp_use_sip) { const struct in_addr *addresses; r = sd_dhcp_lease_get_sip(link->dhcp_lease, &addresses); diff --git a/src/network/tc/qdisc.c b/src/network/tc/qdisc.c index f20f410..f9b9437 100644 --- a/src/network/tc/qdisc.c +++ b/src/network/tc/qdisc.c @@ -293,14 +293,20 @@ QDisc* qdisc_drop(QDisc *qdisc) { link = ASSERT_PTR(qdisc->link); + qdisc_mark(qdisc); /* To avoid stack overflow. */ + /* also drop all child classes assigned to the qdisc. 
*/ SET_FOREACH(tclass, link->tclasses) { + if (tclass_is_marked(tclass)) + continue; + if (TC_H_MAJ(tclass->classid) != qdisc->handle) continue; tclass_drop(tclass); } + qdisc_unmark(qdisc); qdisc_enter_removed(qdisc); if (qdisc->state == 0) { diff --git a/src/network/tc/tclass.c b/src/network/tc/tclass.c index 0a5fec0..394e06d 100644 --- a/src/network/tc/tclass.c +++ b/src/network/tc/tclass.c @@ -260,14 +260,20 @@ TClass* tclass_drop(TClass *tclass) { link = ASSERT_PTR(tclass->link); + tclass_mark(tclass); /* To avoid stack overflow. */ + /* Also drop all child qdiscs assigned to the class. */ SET_FOREACH(qdisc, link->qdiscs) { + if (qdisc_is_marked(qdisc)) + continue; + if (qdisc->parent != tclass->classid) continue; qdisc_drop(qdisc); } + tclass_unmark(tclass); tclass_enter_removed(tclass); if (tclass->state == 0) { diff --git a/src/partition/repart.c b/src/partition/repart.c index 5487aaf..4fabe1b 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -3839,9 +3839,9 @@ static int partition_encrypt(Context *context, Partition *p, PartitionTarget *ta return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Must provide all PCR values when using TPM2 device key."); } else { - r = tpm2_context_new(arg_tpm2_device, &tpm2_context); + r = tpm2_context_new_or_warn(arg_tpm2_device, &tpm2_context); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; if (!tpm2_pcr_values_has_all_values(arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values)) { r = tpm2_pcr_read_missing_values(tpm2_context, arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values); diff --git a/src/pcrextend/pcrextend.c b/src/pcrextend/pcrextend.c index 1295949..394c258 100644 --- a/src/pcrextend/pcrextend.c +++ b/src/pcrextend/pcrextend.c @@ -199,7 +199,7 @@ static int extend_now(unsigned pcr, const void *data, size_t size, Tpm2Userspace _cleanup_(tpm2_context_unrefp) Tpm2Context *c = NULL; int r; - r = tpm2_context_new(arg_tpm2_device, &c); + r = 
tpm2_context_new_or_warn(arg_tpm2_device, &c); if (r < 0) return r; diff --git a/src/pcrlock/pcrlock.c b/src/pcrlock/pcrlock.c index bdc6bbd..dde4dd9 100644 --- a/src/pcrlock/pcrlock.c +++ b/src/pcrlock/pcrlock.c @@ -1194,7 +1194,7 @@ static int event_log_read_pcrs(EventLog *el) { assert(el); - r = tpm2_context_new(NULL, &tc); + r = tpm2_context_new_or_warn(/* device= */ NULL, &tc); if (r < 0) return r; @@ -4281,9 +4281,9 @@ static int verb_make_policy(int argc, char *argv[], void *userdata) { } _cleanup_(tpm2_context_unrefp) Tpm2Context *tc = NULL; - r = tpm2_context_new(NULL, &tc); + r = tpm2_context_new_or_warn(/* device= */ NULL, &tc); if (r < 0) - return log_error_errno(r, "Failed to allocate TPM2 context: %m"); + return r; if (!tpm2_supports_command(tc, TPM2_CC_PolicyAuthorizeNV)) return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 does not support PolicyAuthorizeNV command, refusing."); @@ -4610,7 +4610,7 @@ static int undefine_policy_nv_index( assert(srk_blob); _cleanup_(tpm2_context_unrefp) Tpm2Context *tc = NULL; - r = tpm2_context_new(NULL, &tc); + r = tpm2_context_new_or_warn(/* device= */ NULL, &tc); if (r < 0) return r; diff --git a/src/portable/portable.c b/src/portable/portable.c index 6054f0f..3b2a379 100644 --- a/src/portable/portable.c +++ b/src/portable/portable.c @@ -1623,9 +1623,8 @@ int portable_attach( return 0; } -static bool marker_matches_images(const char *marker, const char *name_or_path, char **extension_image_paths) { +static bool marker_matches_images(const char *marker, const char *name_or_path, char **extension_image_paths, bool match_all) { _cleanup_strv_free_ char **root_and_extensions = NULL; - const char *a; int r; assert(marker); @@ -1635,7 +1634,9 @@ static bool marker_matches_images(const char *marker, const char *name_or_path, * list of images/paths. We enforce strict 1:1 matching, so that we are sure * we are detaching exactly what was attached. 
* For each image, starting with the root, we look for a token in the marker, - * and return a negative answer on any non-matching combination. */ + * and return a negative answer on any non-matching combination. + * If a partial match is allowed, then return immediately once it is found, otherwise + * ensure that everything matches. */ root_and_extensions = strv_new(name_or_path); if (!root_and_extensions) @@ -1645,70 +1646,33 @@ static bool marker_matches_images(const char *marker, const char *name_or_path, if (r < 0) return r; - STRV_FOREACH(image_name_or_path, root_and_extensions) { - _cleanup_free_ char *image = NULL; + /* Ensure the number of images passed matches the number of images listed in the marker */ + while (!isempty(marker)) + STRV_FOREACH(image_name_or_path, root_and_extensions) { + _cleanup_free_ char *image = NULL, *base_image = NULL, *base_image_name_or_path = NULL; - r = extract_first_word(&marker, &image, ":", EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); - if (r < 0) - return log_debug_errno(r, "Failed to parse marker: %s", marker); - if (r == 0) - return false; - - a = last_path_component(image); - - if (image_name_is_valid(*image_name_or_path)) { - const char *e, *underscore; - - /* We shall match against an image name. In that case let's compare the last component, and optionally - * allow either a suffix of ".raw" or a series of "/". - * But allow matching on a different version of the same image, when a "_" is used as a separator. */ - underscore = strchr(*image_name_or_path, '_'); - if (underscore) { - if (strneq(a, *image_name_or_path, underscore - *image_name_or_path)) - continue; - return false; - } - - e = startswith(a, *image_name_or_path); - if (!e) - return false; - - if(!(e[strspn(e, "/")] == 0 || streq(e, ".raw"))) - return false; - } else { - const char *b, *underscore; - size_t l; - - /* We shall match against a path. Let's ignore any prefix here though, as often there are many ways to - * reach the same file. 
However, in this mode, let's validate any file suffix. - * But also ensure that we don't fail if both components don't have a '/' at all - * (strcspn returns the full length of the string in that case, which might not - * match as the versions might differ). */ - - l = strcspn(a, "/"); - b = last_path_component(*image_name_or_path); - - if ((a[l] != '/') != !strchr(b, '/')) /* One is a directory, the other is not */ + r = extract_first_word(&marker, &image, ":", EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return log_debug_errno(r, "Failed to parse marker: %s", marker); + if (r == 0) return false; - if (a[l] != 0 && strcspn(b, "/") != l) - return false; + r = path_extract_image_name(image, &base_image); + if (r < 0) + return log_debug_errno(r, "Failed to extract image name from %s, ignoring: %m", image); - underscore = strchr(b, '_'); - if (underscore) - l = underscore - b; - else { /* Either component could be versioned */ - underscore = strchr(a, '_'); - if (underscore) - l = underscore - a; - } + r = path_extract_image_name(*image_name_or_path, &base_image_name_or_path); + if (r < 0) + return log_debug_errno(r, "Failed to extract image name from %s, ignoring: %m", *image_name_or_path); - if (!strneq(a, b, l)) - return false; + if (!streq(base_image, base_image_name_or_path)) { + if (match_all) + return false; + } else if (!match_all) + return true; } - } - return true; + return match_all; } static int test_chroot_dropin( @@ -1763,7 +1727,9 @@ static int test_chroot_dropin( if (!name_or_path) r = true; else - r = marker_matches_images(marker, name_or_path, extension_image_paths); + /* When detaching we want to match exactly on all images, but when inspecting we only need + * to get the state of one component */ + r = marker_matches_images(marker, name_or_path, extension_image_paths, ret_marker != NULL); if (ret_marker) *ret_marker = TAKE_PTR(marker); diff --git a/src/resolve/resolved-bus.c b/src/resolve/resolved-bus.c index 1ef25ac..75ba29c 100644 --- 
a/src/resolve/resolved-bus.c +++ b/src/resolve/resolved-bus.c @@ -13,6 +13,7 @@ #include "missing_capability.h" #include "resolved-bus.h" #include "resolved-def.h" +#include "resolved-dns-stream.h" #include "resolved-dns-synthesize.h" #include "resolved-dnssd-bus.h" #include "resolved-dnssd.h" @@ -1832,6 +1833,7 @@ static int bus_method_reset_server_features(sd_bus_message *message, void *userd bus_client_log(message, "server feature reset"); + (void) dns_stream_disconnect_all(m); manager_reset_server_features(m); return sd_bus_reply_method_return(message, NULL); @@ -2218,9 +2220,15 @@ static int match_prepare_for_sleep(sd_bus_message *message, void *userdata, sd_b if (b) return 0; - log_debug("Coming back from suspend, verifying all RRs..."); + log_debug("Coming back from suspend, closing all TCP connections..."); + (void) dns_stream_disconnect_all(m); + + log_debug("Coming back from suspend, resetting all probed server features..."); + manager_reset_server_features(m); + log_debug("Coming back from suspend, verifying all RRs..."); manager_verify_all(m); + return 0; } diff --git a/src/resolve/resolved-dns-cache.c b/src/resolve/resolved-dns-cache.c index a9a6492..e90915e 100644 --- a/src/resolve/resolved-dns-cache.c +++ b/src/resolve/resolved-dns-cache.c @@ -531,6 +531,20 @@ static int dns_cache_put_positive( TAKE_PTR(i); return 0; } +/* https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml */ +/* https://www.iana.org/assignments/locally-served-dns-zones/locally-served-dns-zones.xhtml#transport-independent */ +static bool dns_special_use_domain_invalid_answer(DnsResourceKey *key, int rcode) { + /* Sometimes we know a domain exists, even if broken nameservers say otherwise. Make sure not to + * cache any answers we know are wrong. */ + + /* RFC9462 § 6.4: resolvers SHOULD respond to queries of any type other than SVCB for + * _dns.resolver.arpa. 
with NODATA and queries of any type for any domain name under resolver.arpa + * with NODATA. */ + if (dns_name_endswith(dns_resource_key_name(key), "resolver.arpa") > 0 && rcode == DNS_RCODE_NXDOMAIN) + return true; + + return false; +} static int dns_cache_put_negative( DnsCache *c, @@ -561,6 +575,8 @@ static int dns_cache_put_negative( return 0; if (dns_type_is_pseudo(key->type)) return 0; + if (dns_special_use_domain_invalid_answer(key, rcode)) + return 0; if (IN_SET(rcode, DNS_RCODE_SUCCESS, DNS_RCODE_NXDOMAIN)) { if (!soa) diff --git a/src/resolve/resolved-dns-query.c b/src/resolve/resolved-dns-query.c index 7eb6b97..16334c6 100644 --- a/src/resolve/resolved-dns-query.c +++ b/src/resolve/resolved-dns-query.c @@ -57,6 +57,21 @@ static void dns_query_candidate_stop(DnsQueryCandidate *c) { } } +static void dns_query_candidate_abandon(DnsQueryCandidate *c) { + DnsTransaction *t; + + assert(c); + + /* Abandon all the DnsTransactions attached to this query */ + + while ((t = set_steal_first(c->transactions))) { + t->wait_for_answer = true; + set_remove(t->notify_query_candidates, c); + set_remove(t->notify_query_candidates_done, c); + dns_transaction_gc(t); + } +} + static DnsQueryCandidate* dns_query_candidate_unlink(DnsQueryCandidate *c) { assert(c); @@ -354,6 +369,16 @@ static void dns_query_stop(DnsQuery *q) { dns_query_candidate_stop(c); } +static void dns_query_abandon(DnsQuery *q) { + assert(q); + + /* Thankfully transactions have their own timeouts */ + event_source_disable(q->timeout_event_source); + + LIST_FOREACH(candidates_by_query, c, q->candidates) + dns_query_candidate_abandon(c); +} + static void dns_query_unlink_candidates(DnsQuery *q) { assert(q); @@ -588,7 +613,7 @@ void dns_query_complete(DnsQuery *q, DnsTransactionState state) { (void) manager_monitor_send(q->manager, q->state, q->answer_rcode, q->answer_errno, q->question_idna, q->question_utf8, q->question_bypass, q->collected_questions, q->answer); - dns_query_stop(q); + dns_query_abandon(q); 
if (q->complete) q->complete(q); } diff --git a/src/resolve/resolved-dns-rr.c b/src/resolve/resolved-dns-rr.c index 00f7bea..b280a5a 100644 --- a/src/resolve/resolved-dns-rr.c +++ b/src/resolve/resolved-dns-rr.c @@ -181,6 +181,23 @@ bool dns_resource_key_is_dnssd_ptr(const DnsResourceKey *key) { dns_name_endswith(dns_resource_key_name(key), "_udp.local"); } +bool dns_resource_key_is_dnssd_two_label_ptr(const DnsResourceKey *key) { + assert(key); + + /* Check if this is a PTR resource key used in Service Instance + * Enumeration as described in RFC6763 § 4.1, excluding selective + * service names described in RFC6763 § 7.1. */ + + if (key->type != DNS_TYPE_PTR) + return false; + + const char *name = dns_resource_key_name(key); + if (dns_name_parent(&name) <= 0) + return false; + + return dns_name_equal(name, "_tcp.local") || dns_name_equal(name, "_udp.local"); +} + int dns_resource_key_equal(const DnsResourceKey *a, const DnsResourceKey *b) { int r; diff --git a/src/resolve/resolved-dns-rr.h b/src/resolve/resolved-dns-rr.h index fd15cc3..1a12933 100644 --- a/src/resolve/resolved-dns-rr.h +++ b/src/resolve/resolved-dns-rr.h @@ -305,6 +305,7 @@ DnsResourceKey* dns_resource_key_unref(DnsResourceKey *key); const char* dns_resource_key_name(const DnsResourceKey *key); bool dns_resource_key_is_address(const DnsResourceKey *key); bool dns_resource_key_is_dnssd_ptr(const DnsResourceKey *key); +bool dns_resource_key_is_dnssd_two_label_ptr(const DnsResourceKey *key); int dns_resource_key_equal(const DnsResourceKey *a, const DnsResourceKey *b); int dns_resource_key_match_rr(const DnsResourceKey *key, DnsResourceRecord *rr, const char *search_domain); int dns_resource_key_match_cname_or_dname(const DnsResourceKey *key, const DnsResourceKey *cname, const char *search_domain); diff --git a/src/resolve/resolved-dns-scope.c b/src/resolve/resolved-dns-scope.c index 2e8b3e5..af8e9cd 100644 --- a/src/resolve/resolved-dns-scope.c +++ b/src/resolve/resolved-dns-scope.c @@ -424,7 +424,15 
@@ static int dns_scope_socket( return r; } - if (ifindex != 0) { + bool addr_is_nonlocal = s->link && + !manager_find_link_address(s->manager, sa.sa.sa_family, sockaddr_in_addr(&sa.sa)) && + in_addr_is_localhost(sa.sa.sa_family, sockaddr_in_addr(&sa.sa)) == 0; + + if (addr_is_nonlocal && ifindex != 0) { + /* As a special exception we don't use UNICAST_IF if we notice that the specified IP address + * is on the local host. Otherwise, destination addresses on the local host result in + * EHOSTUNREACH, since Linux won't send the packets out of the specified interface, but + * delivers them directly to the local socket. */ r = socket_set_unicast_if(fd, sa.sa.sa_family, ifindex); if (r < 0) return r; @@ -463,19 +471,13 @@ static int dns_scope_socket( else { bool bound = false; - /* Let's temporarily bind the socket to the specified ifindex. The kernel currently takes - * only the SO_BINDTODEVICE/SO_BINDTOINDEX ifindex into account when making routing decisions + /* Let's temporarily bind the socket to the specified ifindex. Older kernels only take + * the SO_BINDTODEVICE/SO_BINDTOINDEX ifindex into account when making routing decisions * in connect() — and not IP_UNICAST_IF. We don't really want any of the other semantics of * SO_BINDTODEVICE/SO_BINDTOINDEX, hence we immediately unbind the socket after the fact * again. - * - * As a special exception we don't do this if we notice that the specified IP address is on - * the local host. SO_BINDTODEVICE in combination with destination addresses on the local - * host result in EHOSTUNREACH, since Linux won't send the packets out of the specified - * interface, but delivers them directly to the local socket. 
*/ - if (s->link && - !manager_find_link_address(s->manager, sa.sa.sa_family, sockaddr_in_addr(&sa.sa)) && - in_addr_is_localhost(sa.sa.sa_family, sockaddr_in_addr(&sa.sa)) == 0) { + */ + if (addr_is_nonlocal) { r = socket_bind_to_ifindex(fd, ifindex); if (r < 0) return r; @@ -589,6 +591,29 @@ static DnsScopeMatch match_subnet_reverse_lookups( return _DNS_SCOPE_MATCH_INVALID; } +/* https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml */ +/* https://www.iana.org/assignments/locally-served-dns-zones/locally-served-dns-zones.xhtml */ +static bool dns_refuse_special_use_domain(const char *domain, DnsQuestion *question) { + /* RFC9462 § 6.4: resolvers SHOULD respond to queries of any type other than SVCB for + * _dns.resolver.arpa. with NODATA and queries of any type for any domain name under + * resolver.arpa with NODATA. */ + if (dns_name_equal(domain, "_dns.resolver.arpa") > 0) { + DnsResourceKey *t; + + /* Only SVCB is permitted to _dns.resolver.arpa */ + DNS_QUESTION_FOREACH(t, question) + if (t->type == DNS_TYPE_SVCB) + return false; + + return true; + } + + if (dns_name_endswith(domain, "resolver.arpa") > 0) + return true; + + return false; +} + DnsScopeMatch dns_scope_good_domain( DnsScope *s, DnsQuery *q) { @@ -601,6 +626,7 @@ DnsScopeMatch dns_scope_good_domain( /* This returns the following return values: * * DNS_SCOPE_NO → This scope is not suitable for lookups of this domain, at all + * DNS_SCOPE_LAST_RESORT→ This scope is not suitable, unless we have no alternative * DNS_SCOPE_MAYBE → This scope is suitable, but only if nothing else wants it * DNS_SCOPE_YES_BASE+n → This scope is suitable, and 'n' suffix labels match * @@ -643,6 +669,10 @@ DnsScopeMatch dns_scope_good_domain( if (dns_name_dont_resolve(domain)) return DNS_SCOPE_NO; + /* Avoid asking invalid questions of some special use domains */ + if (dns_refuse_special_use_domain(domain, question)) + return DNS_SCOPE_NO; + /* Never go to network for the _gateway, 
_outbound, _localdnsstub, _localdnsproxy domain — they're something special, synthesized locally. */ if (is_gateway_hostname(domain) || is_outbound_hostname(domain) || @@ -749,7 +779,7 @@ DnsScopeMatch dns_scope_good_domain( if ((s->family == AF_INET && dns_name_endswith(domain, "in-addr.arpa") > 0) || (s->family == AF_INET6 && dns_name_endswith(domain, "ip6.arpa") > 0)) - return DNS_SCOPE_MAYBE; + return DNS_SCOPE_LAST_RESORT; if ((dns_name_endswith(domain, "local") > 0 && /* only resolve names ending in .local via mDNS */ dns_name_equal(domain, "local") == 0 && /* but not the single-label "local" name itself */ @@ -772,7 +802,7 @@ DnsScopeMatch dns_scope_good_domain( if ((s->family == AF_INET && dns_name_endswith(domain, "in-addr.arpa") > 0) || (s->family == AF_INET6 && dns_name_endswith(domain, "ip6.arpa") > 0)) - return DNS_SCOPE_MAYBE; + return DNS_SCOPE_LAST_RESORT; if ((dns_name_is_single_label(domain) && /* only resolve single label names via LLMNR */ dns_name_equal(domain, "local") == 0 && /* don't resolve "local" with LLMNR, it's the top-level domain of mDNS after all, see above */ @@ -1459,9 +1489,10 @@ int dns_scope_announce(DnsScope *scope, bool goodbye) { continue; } - /* Collect service types for _services._dns-sd._udp.local RRs in a set */ + /* Collect service types for _services._dns-sd._udp.local RRs in a set. Only two-label names + * (not selective names) are considered according to RFC6763 § 9. 
*/ if (!scope->announced && - dns_resource_key_is_dnssd_ptr(z->rr->key)) { + dns_resource_key_is_dnssd_two_label_ptr(z->rr->key)) { if (!set_contains(types, dns_resource_key_name(z->rr->key))) { r = set_ensure_put(&types, &dns_name_hash_ops, dns_resource_key_name(z->rr->key)); if (r < 0) diff --git a/src/resolve/resolved-dns-scope.h b/src/resolve/resolved-dns-scope.h index ca33fd0..b1d1206 100644 --- a/src/resolve/resolved-dns-scope.h +++ b/src/resolve/resolved-dns-scope.h @@ -18,6 +18,7 @@ typedef struct DnsScope DnsScope; typedef enum DnsScopeMatch { DNS_SCOPE_NO, + DNS_SCOPE_LAST_RESORT, DNS_SCOPE_MAYBE, DNS_SCOPE_YES_BASE, /* Add the number of matching labels to this */ DNS_SCOPE_YES_END = DNS_SCOPE_YES_BASE + DNS_N_LABELS_MAX, diff --git a/src/resolve/resolved-dns-stream.c b/src/resolve/resolved-dns-stream.c index ddd1db5..056ba77 100644 --- a/src/resolve/resolved-dns-stream.c +++ b/src/resolve/resolved-dns-stream.c @@ -593,3 +593,44 @@ void dns_stream_detach(DnsStream *s) { dns_server_unref_stream(s->server); } + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + dns_stream_hash_ops, + void, + trivial_hash_func, + trivial_compare_func, + dns_stream_unref); + +int dns_stream_disconnect_all(Manager *m) { + _cleanup_(set_freep) Set *closed = NULL; + int r; + + assert(m); + + /* Terminates all TCP connections (called after system suspend for example, to speed up recovery) */ + + log_info("Closing all remaining TCP connections."); + + bool restart; + do { + restart = false; + + LIST_FOREACH(streams, s, m->dns_streams) { + r = set_ensure_put(&closed, &dns_stream_hash_ops, s); + if (r < 0) + return log_oom(); + if (r > 0) { + /* Haven't seen this one before. Close it. 
*/ + dns_stream_ref(s); + (void) dns_stream_complete(s, ECONNRESET); + + /* This might have a ripple effect, let's hence not look at the list further, + * but scan from the beginning again */ + restart = true; + break; + } + } + } while (restart); + + return 0; +} diff --git a/src/resolve/resolved-dns-stream.h b/src/resolve/resolved-dns-stream.h index ba4a59e..912b9bf 100644 --- a/src/resolve/resolved-dns-stream.h +++ b/src/resolve/resolved-dns-stream.h @@ -126,3 +126,4 @@ static inline bool DNS_STREAM_QUEUED(DnsStream *s) { } void dns_stream_detach(DnsStream *s); +int dns_stream_disconnect_all(Manager *m); diff --git a/src/resolve/resolved-dns-stub.c b/src/resolve/resolved-dns-stub.c index c59e3b7..10b35da 100644 --- a/src/resolve/resolved-dns-stub.c +++ b/src/resolve/resolved-dns-stub.c @@ -837,12 +837,20 @@ static void dns_stub_query_complete(DnsQuery *query) { break; case DNS_TRANSACTION_NO_SERVERS: + /* We're not configured to give answers for this question. Refuse it. */ + (void) dns_stub_send_reply(q, DNS_RCODE_REFUSED); + break; + + case DNS_TRANSACTION_RR_TYPE_UNSUPPORTED: + /* This RR Type is not implemented */ + (void) dns_stub_send_reply(q, DNS_RCODE_NOTIMP); + break; + case DNS_TRANSACTION_INVALID_REPLY: case DNS_TRANSACTION_ERRNO: case DNS_TRANSACTION_ABORTED: case DNS_TRANSACTION_DNSSEC_FAILED: case DNS_TRANSACTION_NO_TRUST_ANCHOR: - case DNS_TRANSACTION_RR_TYPE_UNSUPPORTED: case DNS_TRANSACTION_NETWORK_DOWN: case DNS_TRANSACTION_NO_SOURCE: case DNS_TRANSACTION_STUB_LOOP: diff --git a/src/resolve/resolved-dns-synthesize.c b/src/resolve/resolved-dns-synthesize.c index 5bde29c..6144dc0 100644 --- a/src/resolve/resolved-dns-synthesize.c +++ b/src/resolve/resolved-dns-synthesize.c @@ -463,7 +463,7 @@ int dns_synthesize_answer( name = dns_resource_key_name(key); - if (dns_name_is_root(name)) { + if (dns_name_is_root(name) || dns_name_endswith(name, "resolver.arpa") > 0) { /* Do nothing.
*/ } else if (dns_name_dont_resolve(name)) { diff --git a/src/resolve/resolved-dns-transaction.c b/src/resolve/resolved-dns-transaction.c index 8ff5653..ad8b88e 100644 --- a/src/resolve/resolved-dns-transaction.c +++ b/src/resolve/resolved-dns-transaction.c @@ -175,6 +175,9 @@ DnsTransaction* dns_transaction_gc(DnsTransaction *t) { if (t->block_gc > 0) return t; + if (t->wait_for_answer && IN_SET(t->state, DNS_TRANSACTION_PENDING, DNS_TRANSACTION_VALIDATING)) + return t; + if (set_isempty(t->notify_query_candidates) && set_isempty(t->notify_query_candidates_done) && set_isempty(t->notify_zone_items) && @@ -2229,7 +2232,7 @@ static int dns_transaction_add_dnssec_transaction(DnsTransaction *t, DnsResource return 1; } -static int dns_transaction_request_dnssec_rr(DnsTransaction *t, DnsResourceKey *key) { +static int dns_transaction_request_dnssec_rr_full(DnsTransaction *t, DnsResourceKey *key, DnsTransaction **ret) { _cleanup_(dns_answer_unrefp) DnsAnswer *a = NULL; DnsTransaction *aux; int r; @@ -2246,13 +2249,18 @@ static int dns_transaction_request_dnssec_rr(DnsTransaction *t, DnsResourceKey * if (r < 0) return r; + if (ret) + *ret = NULL; return 0; } /* This didn't work, ask for it via the network/cache then. 
*/ r = dns_transaction_add_dnssec_transaction(t, key, &aux); - if (r == -ELOOP) /* This would result in a cyclic dependency */ + if (r == -ELOOP) { /* This would result in a cyclic dependency */ + if (ret) + *ret = NULL; return 0; + } if (r < 0) return r; @@ -2260,11 +2268,19 @@ static int dns_transaction_request_dnssec_rr(DnsTransaction *t, DnsResourceKey * r = dns_transaction_go(aux); if (r < 0) return r; + if (ret) + *ret = aux; } return 1; } +static int dns_transaction_request_dnssec_rr(DnsTransaction *t, DnsResourceKey *key) { + assert(t); + assert(key); + return dns_transaction_request_dnssec_rr_full(t, key, NULL); +} + static int dns_transaction_negative_trust_anchor_lookup(DnsTransaction *t, const char *name) { int r; @@ -2365,6 +2381,8 @@ static bool dns_transaction_dnssec_supported_full(DnsTransaction *t) { int dns_transaction_request_dnssec_keys(DnsTransaction *t) { DnsResourceRecord *rr; + /* Have we already requested a record that would be sufficient to validate an insecure delegation? 
*/ + bool chased_insecure = false; int r; assert(t); @@ -2377,11 +2395,11 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { * - For RRSIG we get the matching DNSKEY * - For DNSKEY we get the matching DS * - For unsigned SOA/NS we get the matching DS - * - For unsigned CNAME/DNAME/DS we get the parent SOA RR - * - For other unsigned RRs we get the matching SOA RR + * - For unsigned CNAME/DNAME/DS we get the parent DS RR + * - For other unsigned RRs we get the matching DS RR * - For SOA/NS queries with no matching response RR, and no NSEC/NSEC3, the DS RR - * - For DS queries with no matching response RRs, and no NSEC/NSEC3, the parent's SOA RR - * - For other queries with no matching response RRs, and no NSEC/NSEC3, the SOA RR + * - For DS queries with no matching response RRs, and no NSEC/NSEC3, the parent's DS RR + * - For other queries with no matching response RRs, and no NSEC/NSEC3, the DS RR */ if (FLAGS_SET(t->query_flags, SD_RESOLVED_NO_VALIDATE) || t->scope->dnssec_mode == DNSSEC_NO) @@ -2408,6 +2426,7 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { case DNS_TYPE_RRSIG: { /* For each RRSIG we request the matching DNSKEY */ _cleanup_(dns_resource_key_unrefp) DnsResourceKey *dnskey = NULL; + DnsTransaction *aux; /* If this RRSIG is about a DNSKEY RR and the * signer is the same as the owner, then we @@ -2444,9 +2463,22 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { log_debug("Requesting DNSKEY to validate transaction %" PRIu16" (%s, RRSIG with key tag: %" PRIu16 ").", t->id, dns_resource_key_name(rr->key), rr->rrsig.key_tag); - r = dns_transaction_request_dnssec_rr(t, dnskey); + r = dns_transaction_request_dnssec_rr_full(t, dnskey, &aux); if (r < 0) return r; + + /* If we are requesting a DNSKEY, we can anticipate that we will want the matching DS + * in the near future. Let's request it in advance so we don't have to wait in the + * common case.
*/ + if (aux) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *ds = + dns_resource_key_new(rr->key->class, DNS_TYPE_DS, dns_resource_key_name(dnskey)); + if (!ds) + return -ENOMEM; + r = dns_transaction_request_dnssec_rr(t, ds); + if (r < 0) + return r; + } break; } @@ -2521,6 +2553,7 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { if (r > 0) continue; + chased_insecure = true; ds = dns_resource_key_new(rr->key->class, DNS_TYPE_DS, dns_resource_key_name(rr->key)); if (!ds) return -ENOMEM; @@ -2537,11 +2570,11 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { case DNS_TYPE_DS: case DNS_TYPE_CNAME: case DNS_TYPE_DNAME: { - _cleanup_(dns_resource_key_unrefp) DnsResourceKey *soa = NULL; + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *ds = NULL; const char *name; /* CNAMEs and DNAMEs cannot be located at a - * zone apex, hence ask for the parent SOA for + * zone apex, hence ask for the parent DS for * unsigned CNAME/DNAME RRs, maybe that's the * apex. 
But do all that only if this is * actually a response to our original @@ -2575,13 +2608,13 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { if (r == 0) continue; - soa = dns_resource_key_new(rr->key->class, DNS_TYPE_SOA, name); - if (!soa) + ds = dns_resource_key_new(rr->key->class, DNS_TYPE_DS, name); + if (!ds) return -ENOMEM; - log_debug("Requesting parent SOA to validate transaction %" PRIu16 " (%s, unsigned CNAME/DNAME/DS RRset).", + log_debug("Requesting parent DS to validate transaction %" PRIu16 " (%s, unsigned CNAME/DNAME/DS RRset).", t->id, dns_resource_key_name(rr->key)); - r = dns_transaction_request_dnssec_rr(t, soa); + r = dns_transaction_request_dnssec_rr(t, ds); if (r < 0) return r; @@ -2589,11 +2622,11 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { } default: { - _cleanup_(dns_resource_key_unrefp) DnsResourceKey *soa = NULL; + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *ds = NULL; /* For other unsigned RRsets (including * NSEC/NSEC3!), look for proof the zone is - * unsigned, by requesting the SOA RR of the + * unsigned, by requesting the DS RR of the * zone. However, do so only if they are * directly relevant to our original * question. 
*/ @@ -2610,13 +2643,13 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { if (r > 0) continue; - soa = dns_resource_key_new(rr->key->class, DNS_TYPE_SOA, dns_resource_key_name(rr->key)); - if (!soa) + ds = dns_resource_key_new(rr->key->class, DNS_TYPE_DS, dns_resource_key_name(rr->key)); + if (!ds) return -ENOMEM; - log_debug("Requesting SOA to validate transaction %" PRIu16 " (%s, unsigned non-SOA/NS RRset <%s>).", + log_debug("Requesting DS to validate transaction %" PRIu16 " (%s, unsigned non-SOA/NS RRset <%s>).", t->id, dns_resource_key_name(rr->key), dns_resource_record_to_string(rr)); - r = dns_transaction_request_dnssec_rr(t, soa); + r = dns_transaction_request_dnssec_rr(t, ds); if (r < 0) return r; break; @@ -2631,49 +2664,38 @@ int dns_transaction_request_dnssec_keys(DnsTransaction *t) { if (r < 0) return r; if (r > 0) { - const char *name, *signed_status; - uint16_t type = 0; + const char *name = dns_resource_key_name(dns_transaction_key(t)); + bool was_signed = dns_answer_contains_nsec_or_nsec3(t->answer); - name = dns_resource_key_name(dns_transaction_key(t)); - signed_status = dns_answer_contains_nsec_or_nsec3(t->answer) ? "signed" : "unsigned"; - - /* If this was a SOA or NS request, then check if there's a DS RR for the same domain. Note that this - * could also be used as indication that we are not at a zone apex, but in real world setups there are - * too many broken DNS servers (Hello, incapdns.net!) where non-terminal zones return NXDOMAIN even - * though they have further children. If this was a DS request, then it's signed when the parent zone - * is signed, hence ask the parent SOA in that case. If this was any other RR then ask for the SOA RR, - * to see if that is signed. 
*/ - - if (dns_transaction_key(t)->type == DNS_TYPE_DS) { - r = dns_name_parent(&name); - if (r > 0) { - type = DNS_TYPE_SOA; - log_debug("Requesting parent SOA (%s %s) to validate transaction %" PRIu16 " (%s, %s empty DS response).", - special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name, t->id, - dns_resource_key_name(dns_transaction_key(t)), signed_status); - } else + /* If the response is empty, seek the DS for this name, just in case we're at a zone cut + * already, unless we just requested the DS, in which case we have to ask the parent to make + * progress. + * + * If this was an SOA or NS request, we could also skip to the parent, but in real world + * setups there are too many broken DNS servers (Hello, incapdns.net!) where non-terminal + * zones return NXDOMAIN even though they have further children. */ + + if (chased_insecure || was_signed) + /* In this case we already requested what we need above. */ + name = NULL; + else if (dns_transaction_key(t)->type == DNS_TYPE_DS) + /* If the DS response is empty, we'll walk up the dns labels requesting DS until we + * find a referral to the SOA or hit it anyway and get a positive DS response.
*/ + if (dns_name_parent(&name) <= 0) name = NULL; - } else if (IN_SET(dns_transaction_key(t)->type, DNS_TYPE_SOA, DNS_TYPE_NS)) { - - type = DNS_TYPE_DS; - log_debug("Requesting DS (%s %s) to validate transaction %" PRIu16 " (%s, %s empty SOA/NS response).", - special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name, t->id, name, signed_status); - - } else { - type = DNS_TYPE_SOA; - log_debug("Requesting SOA (%s %s) to validate transaction %" PRIu16 " (%s, %s empty non-SOA/NS/DS response).", - special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name, t->id, name, signed_status); - } - if (name) { - _cleanup_(dns_resource_key_unrefp) DnsResourceKey *soa = NULL; + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *ds = NULL; + + log_debug("Requesting DS (%s %s) to validate transaction %" PRIu16 " (%s empty response).", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name, t->id, + dns_resource_key_name(dns_transaction_key(t))); - soa = dns_resource_key_new(dns_transaction_key(t)->class, type, name); - if (!soa) + ds = dns_resource_key_new(dns_transaction_key(t)->class, DNS_TYPE_DS, name); + if (!ds) return -ENOMEM; - r = dns_transaction_request_dnssec_rr(t, soa); + r = dns_transaction_request_dnssec_rr(t, ds); if (r < 0) return r; } @@ -2753,7 +2775,6 @@ static int dns_transaction_requires_rrsig(DnsTransaction *t, DnsResourceRecord * DnsTransaction *dt; /* For SOA or NS RRs we look for a matching DS transaction */ - SET_FOREACH(dt, t->dnssec_transactions) { if (dns_transaction_key(dt)->class != rr->key->class) @@ -2761,7 +2782,7 @@ static int dns_transaction_requires_rrsig(DnsTransaction *t, DnsResourceRecord * if (dns_transaction_key(dt)->type != DNS_TYPE_DS) continue; - r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), dns_resource_key_name(rr->key)); + r = dns_name_endswith(dns_resource_key_name(rr->key), dns_resource_key_name(dns_transaction_key(dt))); if (r < 0) return r; if (r == 0) @@ -2790,16 +2811,16 @@ static int dns_transaction_requires_rrsig(DnsTransaction *t, 
DnsResourceRecord * DnsTransaction *dt; /* - * CNAME/DNAME RRs cannot be located at a zone apex, hence look directly for the parent SOA. + * CNAME/DNAME RRs cannot be located at a zone apex, hence look directly for the parent DS. * - * DS RRs are signed if the parent is signed, hence also look at the parent SOA + * DS RRs are signed if the parent is signed, hence also look at the parent DS */ SET_FOREACH(dt, t->dnssec_transactions) { if (dns_transaction_key(dt)->class != rr->key->class) continue; - if (dns_transaction_key(dt)->type != DNS_TYPE_SOA) + if (dns_transaction_key(dt)->type != DNS_TYPE_DS) continue; if (!parent) { @@ -2817,7 +2838,7 @@ static int dns_transaction_requires_rrsig(DnsTransaction *t, DnsResourceRecord * } } - r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), parent); + r = dns_name_endswith(parent, dns_resource_key_name(dns_transaction_key(dt))); if (r < 0) return r; if (r == 0) @@ -2832,25 +2853,26 @@ static int dns_transaction_requires_rrsig(DnsTransaction *t, DnsResourceRecord * default: { DnsTransaction *dt; - /* Any other kind of RR (including DNSKEY/NSEC/NSEC3). Let's see if our SOA lookup was authenticated */ + /* Any other kind of RR (including DNSKEY/NSEC/NSEC3). Let's see if our DS lookup was authenticated */ SET_FOREACH(dt, t->dnssec_transactions) { - if (dns_transaction_key(dt)->class != rr->key->class) continue; - if (dns_transaction_key(dt)->type != DNS_TYPE_SOA) + if (dns_transaction_key(dt)->type != DNS_TYPE_DS) continue; - r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), dns_resource_key_name(rr->key)); + r = dns_name_endswith(dns_resource_key_name(rr->key), dns_resource_key_name(dns_transaction_key(dt))); if (r < 0) return r; if (r == 0) continue; - /* We found the transaction that was supposed to find the SOA RR for us. It was - * successful, but found no RR for us. This means we are not at a zone cut. In this - * case, we require authentication if the SOA lookup was authenticated too. 
*/ - return FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED); + if (!FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + return false; + + /* We expect this to be signed when the DS record exists, and don't expect it to be + * signed when the DS record is proven not to exist. */ + return dns_answer_match_key(dt->answer, dns_transaction_key(dt), NULL); } return true; @@ -2920,7 +2942,6 @@ static int dns_transaction_requires_nsec(DnsTransaction *t) { char key_str[DNS_RESOURCE_KEY_STRING_MAX]; DnsTransaction *dt; const char *name; - uint16_t type = 0; int r; assert(t); @@ -2955,43 +2976,37 @@ static int dns_transaction_requires_nsec(DnsTransaction *t) { name = dns_resource_key_name(dns_transaction_key(t)); - if (dns_transaction_key(t)->type == DNS_TYPE_DS) { - - /* We got a negative reply for this DS lookup? DS RRs are signed when their parent zone is signed, - * hence check the parent SOA in this case. */ - + if (IN_SET(dns_transaction_key(t)->type, DNS_TYPE_DS, DNS_TYPE_CNAME, DNS_TYPE_DNAME)) { + /* We got a negative reply for this DS/CNAME/DNAME lookup? Check the parent in this case to + * see if this answer should have been signed. */ r = dns_name_parent(&name); if (r < 0) return r; if (r == 0) return true; + } - type = DNS_TYPE_SOA; - - } else if (IN_SET(dns_transaction_key(t)->type, DNS_TYPE_SOA, DNS_TYPE_NS)) - /* We got a negative reply for this SOA/NS lookup? If so, check if there's a DS RR for this */ - type = DNS_TYPE_DS; - else - /* For all other negative replies, check for the SOA lookup */ - type = DNS_TYPE_SOA; - - /* For all other RRs we check the SOA on the same level to see + /* For all other RRs we check the DS on the same level to see * if it's signed. 
*/ SET_FOREACH(dt, t->dnssec_transactions) { - if (dns_transaction_key(dt)->class != dns_transaction_key(t)->class) continue; - if (dns_transaction_key(dt)->type != type) + if (dns_transaction_key(dt)->type != DNS_TYPE_DS) continue; - r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), name); + r = dns_name_endswith(name, dns_resource_key_name(dns_transaction_key(dt))); if (r < 0) return r; if (r == 0) continue; - return FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED); + if (!FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + return false; + + /* We expect this to be signed when the DS record exists, and don't expect it to be signed + * when the DS record is proven not to exist. */ + return dns_answer_match_key(dt->answer, dns_transaction_key(dt), NULL); } /* If in doubt, require NSEC/NSEC3 */ diff --git a/src/resolve/resolved-dns-transaction.h b/src/resolve/resolved-dns-transaction.h index 2fd8720..6be7c5f 100644 --- a/src/resolve/resolved-dns-transaction.h +++ b/src/resolve/resolved-dns-transaction.h @@ -134,6 +134,11 @@ struct DnsTransaction { unsigned block_gc; + /* Set when we're willing to let this transaction live beyond its usefulness for the original query, + * for caching purposes. This blocks gc while there is still a chance we might still receive an + * answer. */ + bool wait_for_answer; + LIST_FIELDS(DnsTransaction, transactions_by_scope); LIST_FIELDS(DnsTransaction, transactions_by_stream); LIST_FIELDS(DnsTransaction, transactions_by_key); diff --git a/src/resolve/resolved-dns-trust-anchor.c b/src/resolve/resolved-dns-trust-anchor.c index 1703c43..8aea5e1 100644 --- a/src/resolve/resolved-dns-trust-anchor.c +++ b/src/resolve/resolved-dns-trust-anchor.c @@ -165,6 +165,11 @@ static int dns_trust_anchor_add_builtin_negative(DnsTrustAnchor *d) { /* Defined by RFC 8375. The most official choice.
*/ "home.arpa\0" + /* RFC 9462 doesn't mention DNSSEC, but this domain + * can't really be signed and clients need to validate + * the answer before using it anyway. */ + "resolver.arpa\0" + /* RFC 8880 says because the 'ipv4only.arpa' zone has to * be an insecure delegation, DNSSEC cannot be used to * protect these answers from tampering by malicious diff --git a/src/rpm/macros.systemd.in b/src/rpm/macros.systemd.in index 241e4b9..317e13d 100644 --- a/src/rpm/macros.systemd.in +++ b/src/rpm/macros.systemd.in @@ -13,6 +13,7 @@ %_udevhwdbdir {{UDEV_HWDB_DIR}} %_udevrulesdir {{UDEV_RULES_DIR}} %_journalcatalogdir {{SYSTEMD_CATALOG_DIR}} +%_kernel_install_dir {{KERNEL_INSTALL_DIR}} %_binfmtdir {{BINFMT_DIR}} %_sysctldir {{SYSCTL_DIR}} %_sysusersdir {{SYSUSERS_DIR}} diff --git a/src/shared/base-filesystem.c b/src/shared/base-filesystem.c index 569ef46..a4e2dae 100644 --- a/src/shared/base-filesystem.c +++ b/src/shared/base-filesystem.c @@ -120,13 +120,13 @@ static const BaseFilesystem table[] = { # else # error "Unknown RISC-V ABI" # endif -#elif defined(__s390__) - /* s390-linux-gnu */ #elif defined(__s390x__) { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" "usr/lib64\0" "usr/lib\0", "ld-lsb-s390x.so.3" }, # define KNOW_LIB64_DIRS 1 +#elif defined(__s390__) + /* s390-linux-gnu */ #elif defined(__sparc__) #endif /* gcc doesn't allow pragma to be used within constructs, hence log about this separately below */ diff --git a/src/shared/blockdev-util.c b/src/shared/blockdev-util.c index c906aec..7a2dd1c 100644 --- a/src/shared/blockdev-util.c +++ b/src/shared/blockdev-util.c @@ -11,6 +11,7 @@ #include "alloc-util.h" #include "blockdev-util.h" #include "btrfs-util.h" +#include "device-private.h" #include "device-util.h" #include "devnum-util.h" #include "dirent-util.h" @@ -367,24 +368,36 @@ int lock_whole_block_device(dev_t devt, int operation) { } int blockdev_partscan_enabled(int fd) { - _cleanup_free_ char *p = NULL, *buf = NULL; - unsigned long long ull; - struct stat st; - 
int r; - - /* Checks if partition scanning is correctly enabled on the block device */ + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + unsigned capability; + int r, ext_range; - if (fstat(fd, &st) < 0) - return -errno; + /* Checks if partition scanning is correctly enabled on the block device. + * + * The 'GENHD_FL_NO_PART_SCAN' flag was introduced by + * https://github.com/torvalds/linux/commit/d27769ec3df1a8de9ca450d2dcd72d1ab259ba32 (v3.2). + * But at that time, the flag is also effectively implied when 'minors' element of 'struct gendisk' + * is 1, which can be checked with 'ext_range' sysfs attribute. Explicit flag ('GENHD_FL_NO_PART_SCAN') + * can be obtained from 'capability' sysattr. + * + * With https://github.com/torvalds/linux/commit/1ebe2e5f9d68e94c524aba876f27b945669a7879 (v5.17), we + * can check the flag from 'ext_range' sysfs attribute directly. + * + * With https://github.com/torvalds/linux/commit/e81cd5a983bb35dabd38ee472cf3fea1c63e0f23 (v6.3), + * the 'capability' sysfs attribute is deprecated, hence we cannot check the flag from it. + * + * To support both old and new kernels, we need to do the following: first check 'ext_range' sysfs + * attribute, and if '1' we can conclude partition scanning is disabled, otherwise check 'capability' + * sysattr for older version. */ - if (!S_ISBLK(st.st_mode)) - return -ENOTBLK; + assert(fd >= 0); - if (asprintf(&p, "/sys/dev/block/%u:%u/capability", major(st.st_rdev), minor(st.st_rdev)) < 0) - return -ENOMEM; + r = block_device_new_from_fd(fd, 0, &dev); + if (r < 0) + return r; - r = read_one_line_file(p, &buf); - if (r == -ENOENT) /* If the capability file doesn't exist then we are most likely looking at a + r = device_get_sysattr_int(dev, "ext_range", &ext_range); + if (r == -ENOENT) /* If the ext_range file doesn't exist then we are most likely looking at a * partition block device, not the whole block device.
And that means we have no * partition scanning on for it (we do for its parent, but not for the partition * itself). */ @@ -392,7 +405,13 @@ int blockdev_partscan_enabled(int fd) { if (r < 0) return r; - r = safe_atollu_full(buf, 16, &ull); + if (ext_range <= 1) /* The value should always be positive, but the kernel uses '%d' for the + * attribute. Let's gracefully handle zero or negative. */ + return false; + + r = device_get_sysattr_unsigned_full(dev, "capability", 16, &capability); + if (r == -ENOENT) + return false; if (r < 0) return r; @@ -400,7 +419,12 @@ int blockdev_partscan_enabled(int fd) { #define GENHD_FL_NO_PART_SCAN (0x0200) #endif - return !FLAGS_SET(ull, GENHD_FL_NO_PART_SCAN); + /* If 0x200 is set, part scanning is definitely off. */ + if (FLAGS_SET(capability, GENHD_FL_NO_PART_SCAN)) + return false; + + /* Otherwise, assume part scanning is on, we have no further checks available. Assume the best. */ + return true; } static int blockdev_is_encrypted(const char *sysfs_path, unsigned depth_left) { diff --git a/src/shared/bpf-dlopen.c b/src/shared/bpf-dlopen.c index 15301ae..f00dbea 100644 --- a/src/shared/bpf-dlopen.c +++ b/src/shared/bpf-dlopen.c @@ -74,18 +74,23 @@ int dlopen_bpf(void) { return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "neither libbpf.so.1 nor libbpf.so.0 are installed: %s", dlerror()); + log_debug("Loaded 'libbpf.so.0' via dlopen()"); + /* symbols deprecated in 1.0 we use as compat */ r = dlsym_many_or_warn( dl, LOG_DEBUG, #if MODERN_LIBBPF /* Don't exist anymore in new libbpf, hence cannot type check them */ DLSYM_ARG_FORCE(bpf_create_map), - DLSYM_ARG_FORCE(bpf_probe_prog_type)); + DLSYM_ARG_FORCE(bpf_probe_prog_type) #else DLSYM_ARG(bpf_create_map), - DLSYM_ARG(bpf_probe_prog_type)); + DLSYM_ARG(bpf_probe_prog_type) #endif + ); } else { + log_debug("Loaded 'libbpf.so.1' via dlopen()"); + /* symbols available from 0.7.0 */ r = dlsym_many_or_warn( dl, LOG_DEBUG, @@ -99,6 +104,8 @@ int dlopen_bpf(void) { #endif ); } + if (r <
0) + return r; r = dlsym_many_or_warn( dl, LOG_DEBUG, diff --git a/src/shared/btrfs-util.c b/src/shared/btrfs-util.c index b3e4b50..2ed6bf2 100644 --- a/src/shared/btrfs-util.c +++ b/src/shared/btrfs-util.c @@ -65,7 +65,7 @@ int btrfs_subvol_set_read_only_at(int dir_fd, const char *path, bool b) { assert(dir_fd >= 0 || dir_fd == AT_FDCWD); - fd = xopenat(dir_fd, path, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); if (fd < 0) return fd; @@ -113,7 +113,7 @@ int btrfs_get_block_device_at(int dir_fd, const char *path, dev_t *ret) { assert(path); assert(ret); - fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); if (fd < 0) return fd; @@ -1276,8 +1276,6 @@ static int subvol_snapshot_children( if (FLAGS_SET(flags, BTRFS_SNAPSHOT_LOCK_BSD)) { subvolume_fd = xopenat_lock(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW, - /* xopen_flags = */ 0, - /* mode = */ 0, LOCK_BSD, LOCK_EX); if (subvolume_fd < 0) @@ -1445,7 +1443,7 @@ int btrfs_subvol_snapshot_at_full( assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD); assert(to); - old_fd = xopenat(dir_fdf, from, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY, /* xopen_flags = */ 0, /* mode = */ 0); + old_fd = xopenat(dir_fdf, from, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); if (old_fd < 0) return old_fd; @@ -1482,8 +1480,6 @@ int btrfs_subvol_snapshot_at_full( if (FLAGS_SET(flags, BTRFS_SNAPSHOT_LOCK_BSD)) { subvolume_fd = xopenat_lock(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW, - /* xopen_flags = */ 0, - /* mode = */ 0, LOCK_BSD, LOCK_EX); if (subvolume_fd < 0) diff --git a/src/shared/copy.c b/src/shared/copy.c index c0e30cd..2b87cba 100644 --- a/src/shared/copy.c +++ b/src/shared/copy.c @@ -208,6 +208,7 @@ int copy_bytes_full( r = reflink_range(fdf, foffset, fdt, 
toffset, max_bytes == UINT64_MAX ? 0 : max_bytes); /* partial reflink */ if (r >= 0) { off_t t; + int ret; /* This worked, yay! Now — to be fully correct — let's adjust the file pointers */ if (max_bytes == UINT64_MAX) { @@ -226,7 +227,14 @@ int copy_bytes_full( if (t < 0) return -errno; - return 0; /* we copied the whole thing, hence hit EOF, return 0 */ + if (FLAGS_SET(copy_flags, COPY_VERIFY_LINKED)) { + r = fd_verify_linked(fdf); + if (r < 0) + return r; + } + + /* We copied the whole thing, hence hit EOF, return 0. */ + ret = 0; } else { t = lseek(fdf, foffset + max_bytes, SEEK_SET); if (t < 0) @@ -236,8 +244,18 @@ int copy_bytes_full( if (t < 0) return -errno; - return 1; /* we copied only some number of bytes, which worked, but this means we didn't hit EOF, return 1 */ + /* We copied only some number of bytes, which worked, but + * this means we didn't hit EOF, return 1. */ + ret = 1; + } + + if (FLAGS_SET(copy_flags, COPY_VERIFY_LINKED)) { + r = fd_verify_linked(fdf); + if (r < 0) + return r; } + + return ret; } } } @@ -316,7 +334,7 @@ int copy_bytes_full( if (try_cfr) { n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u); if (n < 0) { - if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF)) + if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF, -EOPNOTSUPP)) return n; try_cfr = false; @@ -483,6 +501,12 @@ int copy_bytes_full( copied_something = true; } + if (FLAGS_SET(copy_flags, COPY_VERIFY_LINKED)) { + r = fd_verify_linked(fdf); + if (r < 0) + return r; + } + if (copy_flags & COPY_TRUNCATE) { off_t off = lseek(fdt, 0, SEEK_CUR); if (off < 0) @@ -508,7 +532,6 @@ static int fd_copy_symlink( _cleanup_free_ char *target = NULL; int r; - assert(from); assert(st); assert(to); @@ -526,7 +549,10 @@ static int fd_copy_symlink( mac_selinux_create_file_clear(); if (r < 0) { if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r))) { - log_notice_errno(r, "Failed to copy symlink '%s', ignoring: %m", from); + log_notice_errno(r, 
"Failed to copy symlink%s%s%s, ignoring: %m", + isempty(from) ? "" : " '", + strempty(from), + isempty(from) ? "" : "'"); return 0; } @@ -757,7 +783,6 @@ static int fd_copy_regular( _cleanup_close_ int fdf = -EBADF, fdt = -EBADF; int r, q; - assert(from); assert(st); assert(to); @@ -767,9 +792,9 @@ static int fd_copy_regular( if (r > 0) /* worked! */ return 0; - fdf = openat(df, from, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + fdf = xopenat(df, from, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); if (fdf < 0) - return -errno; + return fdf; if (copy_flags & COPY_MAC_CREATE) { r = mac_selinux_create_file_prepare_at(dt, to, S_IFREG); @@ -797,6 +822,12 @@ static int fd_copy_regular( (void) futimens(fdt, (struct timespec[]) { st->st_atim, st->st_mtim }); (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags); + if (FLAGS_SET(copy_flags, COPY_VERIFY_LINKED)) { + r = fd_verify_linked(fdf); + if (r < 0) + return r; + } + if (copy_flags & COPY_FSYNC) { if (fsync(fdt) < 0) { r = -errno; @@ -830,7 +861,6 @@ static int fd_copy_fifo( HardlinkContext *hardlink_context) { int r; - assert(from); assert(st); assert(to); @@ -849,7 +879,10 @@ static int fd_copy_fifo( if (copy_flags & COPY_MAC_CREATE) mac_selinux_create_file_clear(); if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) { - log_notice_errno(r, "Failed to copy fifo '%s', ignoring: %m", from); + log_notice_errno(r, "Failed to copy fifo%s%s%s, ignoring: %m", + isempty(from) ? "" : " '", + strempty(from), + isempty(from) ? 
"" : "'"); return 0; } else if (r < 0) return r; @@ -881,7 +914,6 @@ static int fd_copy_node( HardlinkContext *hardlink_context) { int r; - assert(from); assert(st); assert(to); @@ -900,7 +932,10 @@ static int fd_copy_node( if (copy_flags & COPY_MAC_CREATE) mac_selinux_create_file_clear(); if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) { - log_notice_errno(r, "Failed to copy node '%s', ignoring: %m", from); + log_notice_errno(r, "Failed to copy node%s%s%s, ignoring: %m", + isempty(from) ? "" : " '", + strempty(from), + isempty(from) ? "" : "'"); return 0; } else if (r < 0) return r; @@ -955,12 +990,9 @@ static int fd_copy_directory( if (depth_left == 0) return -ENAMETOOLONG; - if (from) - fdf = openat(df, from, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); - else - fdf = fcntl(df, F_DUPFD_CLOEXEC, 3); + fdf = xopenat(df, from, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); if (fdf < 0) - return -errno; + return fdf; if (!hardlink_context) { /* If recreating hardlinks is requested let's set up a context for that now. */ @@ -984,19 +1016,19 @@ static int fd_copy_directory( exists = r >= 0; - fdt = xopenat_lock(dt, to, - O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|(exists ? 0 : O_CREAT|O_EXCL), - (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0)|(set_contains(subvolumes, st) ? XO_SUBVOLUME : 0), - st->st_mode & 07777, - copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE, - LOCK_EX); + fdt = xopenat_lock_full(dt, to, + O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|(exists ? 0 : O_CREAT|O_EXCL), + (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0)|(set_contains(subvolumes, st) ? XO_SUBVOLUME : 0), + st->st_mode & 07777, + copy_flags & COPY_LOCK_BSD ? 
LOCK_BSD : LOCK_NONE, + LOCK_EX); if (fdt < 0) return fdt; r = 0; if (PTR_TO_INT(hashmap_get(denylist, st)) == DENY_CONTENTS) { - log_debug("%s is in the denylist, not recursing", from); + log_debug("%s is in the denylist, not recursing", from ?: "file to copy"); goto finish; } @@ -1030,7 +1062,8 @@ static int fd_copy_directory( } if (PTR_TO_INT(hashmap_get(denylist, &buf)) == DENY_INODE) { - log_debug("%s/%s is in the denylist, ignoring", from, de->d_name); + log_debug("%s%s%s is in the denylist, ignoring", + strempty(from), isempty(from) ? "" : "/", de->d_name); continue; } @@ -1163,10 +1196,10 @@ static int fd_copy_tree_generic( DenyType t = PTR_TO_INT(hashmap_get(denylist, st)); if (t == DENY_INODE) { - log_debug("%s is in the denylist, ignoring", from); + log_debug("%s is in the denylist, ignoring", from ?: "file to copy"); return 0; } else if (t == DENY_CONTENTS) - log_debug("%s is configured to have its contents excluded, but is not a directory", from); + log_debug("%s is configured to have its contents excluded, but is not a directory", from ?: "file to copy"); r = fd_copy_leaf(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, display_path, progress_bytes, userdata); /* We just tried to copy a leaf node of the tree. If it failed because the node already exists *and* the COPY_REPLACE flag has been provided, we should unlink the node and re-copy. */ @@ -1198,11 +1231,10 @@ int copy_tree_at_full( struct stat st; int r; - assert(from); assert(to); assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); - if (fstatat(fdf, from, &st, AT_SYMLINK_NOFOLLOW) < 0) + if (fstatat(fdf, strempty(from), &st, AT_SYMLINK_NOFOLLOW | (isempty(from) ? 
AT_EMPTY_PATH : 0)) < 0) return -errno; r = fd_copy_tree_generic(fdf, from, &st, fdt, to, st.st_dev, COPY_DEPTH_MAX, override_uid, @@ -1305,13 +1337,12 @@ int copy_file_fd_at_full( int r; assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD); - assert(from); assert(fdt >= 0); assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); - fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY); + fdf = xopenat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY); if (fdf < 0) - return -errno; + return fdf; r = fd_verify_regular(fdf); if (r < 0) @@ -1332,6 +1363,12 @@ int copy_file_fd_at_full( (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags); } + if (FLAGS_SET(copy_flags, COPY_VERIFY_LINKED)) { + r = fd_verify_linked(fdf); + if (r < 0) + return r; + } + if (copy_flags & COPY_FSYNC_FULL) { r = fsync_full(fdt); if (r < 0) @@ -1363,12 +1400,11 @@ int copy_file_at_full( assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD); assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD); - assert(from); assert(to); - fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY); + fdf = xopenat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY); if (fdf < 0) - return -errno; + return fdf; if (fstat(fdf, &st) < 0) return -errno; @@ -1378,11 +1414,11 @@ int copy_file_at_full( return r; WITH_UMASK(0000) { - fdt = xopenat_lock(dir_fdt, to, - flags|O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, - (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0), - mode != MODE_INVALID ? mode : st.st_mode, - copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE, LOCK_EX); + fdt = xopenat_lock_full(dir_fdt, to, + flags|O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, + (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0), + mode != MODE_INVALID ? mode : st.st_mode, + copy_flags & COPY_LOCK_BSD ? 
LOCK_BSD : LOCK_NONE, LOCK_EX); if (fdt < 0) return fdt; } @@ -1403,6 +1439,12 @@ int copy_file_at_full( (void) copy_times(fdf, fdt, copy_flags); (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags); + if (FLAGS_SET(copy_flags, COPY_VERIFY_LINKED)) { + r = fd_verify_linked(fdf); + if (r < 0) + goto fail; + } + if (chattr_mask != 0) (void) chattr_fd(fdt, chattr_flags, chattr_mask & ~CHATTR_EARLY_FL, NULL); @@ -1451,7 +1493,6 @@ int copy_file_atomic_at_full( _cleanup_close_ int fdt = -EBADF; int r; - assert(from); assert(to); assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); diff --git a/src/shared/copy.h b/src/shared/copy.h index d842edd..b8fb28a 100644 --- a/src/shared/copy.h +++ b/src/shared/copy.h @@ -30,6 +30,7 @@ typedef enum CopyFlags { COPY_GRACEFUL_WARN = 1 << 15, /* Skip copying file types that aren't supported by the target filesystem */ COPY_TRUNCATE = 1 << 16, /* Truncate to current file offset after copying */ COPY_LOCK_BSD = 1 << 17, /* Return a BSD exclusively locked file descriptor referring to the copied image/directory. */ + COPY_VERIFY_LINKED = 1 << 18, /* Check the source file is still linked after copying. 
*/ } CopyFlags; typedef enum DenyType { diff --git a/src/shared/creds-util.c b/src/shared/creds-util.c index 7cc8889..fa8ebe0 100644 --- a/src/shared/creds-util.c +++ b/src/shared/creds-util.c @@ -826,9 +826,9 @@ int encrypt_credential_and_warn( tpm2_pubkey_pcr_mask = 0; _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL; - r = tpm2_context_new(tpm2_device, &tpm2_context); + r = tpm2_context_new_or_warn(tpm2_device, &tpm2_context); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; r = tpm2_get_best_pcr_bank(tpm2_context, tpm2_hash_pcr_mask | tpm2_pubkey_pcr_mask, &tpm2_pcr_bank); if (r < 0) diff --git a/src/shared/data-fd-util.h b/src/shared/data-fd-util.h index 4f3d8b8..6d99209 100644 --- a/src/shared/data-fd-util.h +++ b/src/shared/data-fd-util.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #pragma once -#include +#include enum { ACQUIRE_NO_DEV_NULL = 1 << 0, diff --git a/src/shared/dlfcn-util.c b/src/shared/dlfcn-util.c index a321df3..8022f55 100644 --- a/src/shared/dlfcn-util.c +++ b/src/shared/dlfcn-util.c @@ -49,6 +49,8 @@ int dlopen_many_sym_or_warn_sentinel(void **dlp, const char *filename, int log_l return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "%s is not installed: %s", filename, dlerror()); + log_debug("Loaded '%s' via dlopen()", filename); + va_list ap; va_start(ap, log_level); r = dlsym_many_or_warnv(dl, log_level, ap); diff --git a/src/shared/hibernate-util.c b/src/shared/hibernate-util.c index 0d215e8..c3991cf 100644 --- a/src/shared/hibernate-util.c +++ b/src/shared/hibernate-util.c @@ -23,6 +23,7 @@ #include "log.h" #include "parse-util.h" #include "path-util.h" +#include "proc-cmdline.h" #include "stat-util.h" #include "string-util.h" #include "strv.h" @@ -129,6 +130,13 @@ static int read_resume_config(dev_t *ret_devno, uint64_t *ret_offset) { assert(ret_devno); assert(ret_offset); + r = proc_cmdline_get_key("noresume", /* flags = */ 0, /* ret_value = */ NULL); + if (r < 
0) + return log_debug_errno(r, "Failed to check if 'noresume' kernel command line option is set: %m"); + if (r > 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "'noresume' kernel command line option is set, refusing hibernation device lookup."); + r = read_one_line_file("/sys/power/resume", &devno_str); if (r < 0) return log_debug_errno(r, "Failed to read /sys/power/resume: %m"); diff --git a/src/shared/idn-util.c b/src/shared/idn-util.c index 6f36688..26a9d60 100644 --- a/src/shared/idn-util.c +++ b/src/shared/idn-util.c @@ -50,7 +50,10 @@ int dlopen_idn(void) { if (!dl) return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libidn support is not installed: %s", dlerror()); - } + log_debug("Loaded 'libidn.so.11' via dlopen()"); + } else + log_debug("Loaded 'libidn.so.12' via dlopen()"); + r = dlsym_many_or_warn( dl, diff --git a/src/shared/install.c b/src/shared/install.c index 0f4dab4..27a421e 100644 --- a/src/shared/install.c +++ b/src/shared/install.c @@ -340,9 +340,12 @@ void install_changes_dump(int r, const char *verb, const InstallChange *changes, assert(verb || r >= 0); for (size_t i = 0; i < n_changes; i++) { - if (changes[i].type < 0) - assert(verb); assert(changes[i].path); + /* This tries to tell the compiler that it's safe to use 'verb' in a string format if there + * was an error, but the compiler doesn't care and fails anyway, so strna(verb) is used + * too. */ + assert(verb || changes[i].type >= 0); + verb = strna(verb); /* When making changes here, make sure to also change install_error() in dbus-manager.c. 
*/ diff --git a/src/shared/journal-file-util.c b/src/shared/journal-file-util.c index e444a2b..bdceac4 100644 --- a/src/shared/journal-file-util.c +++ b/src/shared/journal-file-util.c @@ -210,11 +210,16 @@ static void journal_file_set_offline_internal(JournalFile *f) { log_debug_errno(r, "Failed to re-enable copy-on-write for %s: %m, rewriting file", f->path); - r = copy_file_atomic_full(FORMAT_PROC_FD_PATH(f->fd), f->path, f->mode, - 0, - FS_NOCOW_FL, - COPY_REPLACE | COPY_FSYNC | COPY_HOLES | COPY_ALL_XATTRS, - NULL, NULL); + /* Here, setting COPY_VERIFY_LINKED flag is crucial. Otherwise, a broken + * journal file may be created, if journal_directory_vacuum() -> + * unlinkat_deallocate() is called in the main thread while this thread is + * copying the file. See issue #24150 and #31222. */ + r = copy_file_atomic_at_full( + f->fd, NULL, AT_FDCWD, f->path, f->mode, + 0, + FS_NOCOW_FL, + COPY_REPLACE | COPY_FSYNC | COPY_HOLES | COPY_ALL_XATTRS | COPY_VERIFY_LINKED, + NULL, NULL); if (r < 0) { log_debug_errno(r, "Failed to rewrite %s: %m", f->path); continue; diff --git a/src/shared/logs-show.c b/src/shared/logs-show.c index a5d0400..0a31be3 100644 --- a/src/shared/logs-show.c +++ b/src/shared/logs-show.c @@ -2088,7 +2088,7 @@ int journal_get_boots(sd_journal *j, BootId **ret_boots, size_t *ret_n_boots) { if (sd_id128_equal(i->id, boot.id)) /* The boot id is already stored, something wrong with the journal files. * Exiting as otherwise this problem would cause an infinite loop. 
*/ - break; + goto finish; if (!GREEDY_REALLOC(boots, n_boots + 1)) return -ENOMEM; @@ -2096,6 +2096,7 @@ int journal_get_boots(sd_journal *j, BootId **ret_boots, size_t *ret_n_boots) { boots[n_boots++] = boot; } + finish: *ret_boots = TAKE_PTR(boots); *ret_n_boots = n_boots; return n_boots > 0; diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c index 5860303..6d55df7 100644 --- a/src/shared/loop-util.c +++ b/src/shared/loop-util.c @@ -702,9 +702,9 @@ int loop_device_make_by_path_at( direct_flags = FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0; rdwr_flags = open_flags >= 0 ? open_flags : O_RDWR; - fd = xopenat(dir_fd, path, basic_flags|direct_flags|rdwr_flags, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, basic_flags|direct_flags|rdwr_flags); if (fd < 0 && direct_flags != 0) /* If we had O_DIRECT on, and things failed with that, let's immediately try again without */ - fd = xopenat(dir_fd, path, basic_flags|rdwr_flags, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, basic_flags|rdwr_flags); else direct = direct_flags != 0; if (fd < 0) { @@ -714,9 +714,9 @@ int loop_device_make_by_path_at( if (open_flags >= 0 || !(ERRNO_IS_PRIVILEGE(r) || r == -EROFS)) return r; - fd = xopenat(dir_fd, path, basic_flags|direct_flags|O_RDONLY, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, basic_flags|direct_flags|O_RDONLY); if (fd < 0 && direct_flags != 0) /* as above */ - fd = xopenat(dir_fd, path, basic_flags|O_RDONLY, /* xopen_flags = */ 0, /* mode = */ 0); + fd = xopenat(dir_fd, path, basic_flags|O_RDONLY); else direct = direct_flags != 0; if (fd < 0) diff --git a/src/shared/open-file.c b/src/shared/open-file.c index 42772bd..7d7a8a9 100644 --- a/src/shared/open-file.c +++ b/src/shared/open-file.c @@ -96,7 +96,7 @@ int open_file_to_string(const OpenFile *of, char **ret) { assert(of); assert(ret); - s = shell_escape(of->path, ":"); + s = xescape(of->path, ":"); if (!s) return -ENOMEM; diff --git 
a/src/shared/serialize.c b/src/shared/serialize.c index 483cbc7..344b102 100644 --- a/src/shared/serialize.c +++ b/src/shared/serialize.c @@ -180,7 +180,7 @@ int serialize_strv(FILE *f, const char *key, char **l) { } int serialize_pidref(FILE *f, FDSet *fds, const char *key, PidRef *pidref) { - int copy; + int r; assert(f); assert(fds); @@ -188,17 +188,23 @@ int serialize_pidref(FILE *f, FDSet *fds, const char *key, PidRef *pidref) { if (!pidref_is_set(pidref)) return 0; - /* If we have a pidfd we serialize the fd and encode the fd number prefixed by "@" in the - * serialization. Otherwise we serialize the numeric PID as it is. */ + /* We always serialize the pid separately, to keep downgrades mostly working (older versions will + * deserialize the pid and silently fail to deserialize the pidfd). If we also have a pidfd, we + * serialize both the pid and pidfd, so that we can construct the exact same pidref after + * deserialization (this doesn't work with only the pidfd, as we can't retrieve the original pid + * from the pidfd anymore if the process is reaped). 
*/ - if (pidref->fd < 0) - return serialize_item_format(f, key, PID_FMT, pidref->pid); + if (pidref->fd >= 0) { + int copy = fdset_put_dup(fds, pidref->fd); + if (copy < 0) + return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m"); - copy = fdset_put_dup(fds, pidref->fd); - if (copy < 0) - return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m"); + r = serialize_item_format(f, key, "@%i:" PID_FMT, copy, pidref->pid); + if (r < 0) + return r; + } - return serialize_item_format(f, key, "@%i", copy); + return serialize_item_format(f, key, PID_FMT, pidref->pid); } int serialize_ratelimit(FILE *f, const char *key, const RateLimit *rl) { @@ -476,12 +482,39 @@ int deserialize_pidref(FDSet *fds, const char *value, PidRef *ret) { e = startswith(value, "@"); if (e) { - int fd = deserialize_fd(fds, e); + _cleanup_free_ char *fdstr = NULL, *pidstr = NULL; + _cleanup_close_ int fd = -EBADF; + + r = extract_many_words(&e, ":", /* flags = */ 0, &fdstr, &pidstr, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to deserialize pidref '%s': %m", e); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot deserialize pidref from empty string."); + + assert(r <= 2); + fd = deserialize_fd(fds, fdstr); if (fd < 0) return fd; - r = pidref_set_pidfd_consume(ret, fd); + /* The serialization format changed after 255.4. In systemd <= 255.4 only pidfd is + * serialized, but that causes problems when reconstructing pidref (see serialize_pidref for + * details). After 255.4 the pid is serialized as well even if we have a pidfd, but we still + * need to support older format as we might be upgrading from a version that still uses the + * old format. 
*/ + if (pidstr) { + pid_t pid; + + r = parse_pid(pidstr, &pid); + if (r < 0) + return log_debug_errno(r, "Failed to parse PID: %s", pidstr); + + *ret = (PidRef) { + .pid = pid, + .fd = TAKE_FD(fd), + }; + } else + r = pidref_set_pidfd_consume(ret, TAKE_FD(fd)); } else { pid_t pid; diff --git a/src/shared/tpm2-util.c b/src/shared/tpm2-util.c index 30b4f57..c7e0b24 100644 --- a/src/shared/tpm2-util.c +++ b/src/shared/tpm2-util.c @@ -664,7 +664,9 @@ int tpm2_context_new(const char *device, Tpm2Context **ret_context) { context->tcti_dl = dlopen(fn, RTLD_NOW); if (!context->tcti_dl) - return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to load %s: %s", fn, dlerror()); + return log_debug_errno(SYNTHETIC_ERRNO(ENOPKG), "Failed to load %s: %s", fn, dlerror()); + + log_debug("Loaded '%s' via dlopen()", fn); func = dlsym(context->tcti_dl, TSS2_TCTI_INFO_SYMBOL); if (!func) @@ -678,7 +680,7 @@ int tpm2_context_new(const char *device, Tpm2Context **ret_context) { log_debug("Loaded TCTI module '%s' (%s) [Version %" PRIu32 "]", info->name, info->description, info->version); - rc = info->init(NULL, &sz, NULL); + rc = info->init(/* context= */ NULL, &sz, /* param= */ NULL); if (rc != TPM2_RC_SUCCESS) return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to initialize TCTI context: %s", sym_Tss2_RC_Decode(rc)); @@ -713,19 +715,37 @@ int tpm2_context_new(const char *device, Tpm2Context **ret_context) { /* We require AES and CFB support for session encryption. 
*/ if (!tpm2_supports_alg(context, TPM2_ALG_AES)) - return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support AES."); + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM does not support AES."); if (!tpm2_supports_alg(context, TPM2_ALG_CFB)) - return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support CFB."); + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM does not support CFB."); if (!tpm2_supports_tpmt_sym_def(context, &SESSION_TEMPLATE_SYM_AES_128_CFB)) - return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support AES-128-CFB."); + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM does not support AES-128-CFB."); *ret_context = TAKE_PTR(context); return 0; } +int tpm2_context_new_or_warn(const char *device, Tpm2Context **ret_context) { + int r; + + assert(ret_context); + + r = tpm2_context_new(device, ret_context); + if (r == -EOPNOTSUPP) + return log_error_errno(r, "TPM device not usable as it does not support the required functionality (AES-128-CFB missing?)."); + if (r == -ENOPKG) + return log_error_errno(r, "TPM TCTI driver not available."); + if (r == -ENOENT) + return log_error_errno(r, "TPM device not found."); + if (r < 0) + return log_error_errno(r, "Failed to create TPM2 context: %m"); + + return 0; +} + static void tpm2_handle_cleanup(ESYS_CONTEXT *esys_context, ESYS_TR esys_handle, bool flush) { TSS2_RC rc; @@ -5540,13 +5560,13 @@ int tpm2_unseal(Tpm2Context *c, if (r < 0) return r; - _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL; - r = tpm2_make_encryption_session(c, primary_handle, hmac_key, &encryption_session); - if (r < 0) - return r; - _cleanup_(Esys_Freep) TPM2B_SENSITIVE_DATA* unsealed = NULL; for (unsigned i = RETRY_UNSEAL_MAX;; i--) { + _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL; + r = tpm2_make_encryption_session(c, primary_handle, hmac_key, &encryption_session); + if (r < 0) + return r; + 
_cleanup_(tpm2_handle_freep) Tpm2Handle *policy_session = NULL; _cleanup_(Esys_Freep) TPM2B_DIGEST *policy_digest = NULL; r = tpm2_make_policy_session( diff --git a/src/shared/tpm2-util.h b/src/shared/tpm2-util.h index 55d7481..911a3c7 100644 --- a/src/shared/tpm2-util.h +++ b/src/shared/tpm2-util.h @@ -72,6 +72,7 @@ typedef struct { } Tpm2Context; int tpm2_context_new(const char *device, Tpm2Context **ret_context); +int tpm2_context_new_or_warn(const char *device, Tpm2Context **ret_context); Tpm2Context *tpm2_context_ref(Tpm2Context *context); Tpm2Context *tpm2_context_unref(Tpm2Context *context); DEFINE_TRIVIAL_CLEANUP_FUNC(Tpm2Context*, tpm2_context_unref); diff --git a/src/shared/verbs.c b/src/shared/verbs.c index a010952..a38591d 100644 --- a/src/shared/verbs.c +++ b/src/shared/verbs.c @@ -13,22 +13,21 @@ #include "verbs.h" #include "virt.h" -/* Wraps running_in_chroot() which is used in various places, but also adds an environment variable check so external - * processes can reliably force this on. - */ +/* Wraps running_in_chroot() which is used in various places, but also adds an environment variable check + * so external processes can reliably force this on. */ bool running_in_chroot_or_offline(void) { int r; - /* Added to support use cases like rpm-ostree, where from %post scripts we only want to execute "preset", but - * not "start"/"restart" for example. + /* Added to support use cases like rpm-ostree, where from %post scripts we only want to execute "preset", + * but not "start"/"restart" for example. * * See docs/ENVIRONMENT.md for docs. */ r = getenv_bool("SYSTEMD_OFFLINE"); - if (r < 0 && r != -ENXIO) - log_debug_errno(r, "Failed to parse $SYSTEMD_OFFLINE: %m"); - else if (r >= 0) + if (r >= 0) return r > 0; + if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_OFFLINE, ignoring: %m"); /* We've had this condition check for a long time which basically checks for legacy chroot case like Fedora's * "mock", which is used for package builds. 
We don't want to try to start systemd services there, since @@ -40,8 +39,7 @@ bool running_in_chroot_or_offline(void) { */ r = running_in_chroot(); if (r < 0) - log_debug_errno(r, "running_in_chroot(): %m"); - + log_debug_errno(r, "Failed to check if we're running in chroot, assuming not: %m"); return r > 0; } @@ -145,6 +143,17 @@ int dispatch_verb(int argc, char *argv[], const Verb verbs[], void *userdata) { return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown command verb '%s'.", name); } + _cleanup_free_ char *verb_list = NULL; + size_t i; + + for (i = 0; verbs[i].dispatch; i++) + if (!strextend_with_separator(&verb_list, ", ", verbs[i].verb)) + return log_oom(); + + if (i > 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Command verb required (one of %s).", verb_list); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Command verb required."); } diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c index 2d79f71..99ccefb 100644 --- a/src/shared/watchdog.c +++ b/src/shared/watchdog.c @@ -95,7 +95,7 @@ static int set_pretimeout_governor(const char *governor) { governor, WRITE_STRING_FILE_DISABLE_BUFFER | WRITE_STRING_FILE_VERIFY_ON_FAILURE | WRITE_STRING_FILE_VERIFY_IGNORE_NEWLINE); if (r < 0) - return log_error_errno(r, "Failed to set pretimeout_governor to '%s': %m", governor); + return log_error_errno(r, "Failed to set watchdog pretimeout_governor to '%s': %m", governor); return r; } @@ -157,7 +157,7 @@ static int watchdog_read_pretimeout(void) { if (ioctl(watchdog_fd, WDIOC_GETPRETIMEOUT, &sec) < 0) { watchdog_pretimeout = 0; - return log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? LOG_DEBUG : LOG_WARNING, errno, "Failed to get pretimeout value, ignoring: %m"); + return log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? 
LOG_DEBUG : LOG_WARNING, errno, "Failed to get watchdog pretimeout value, ignoring: %m"); } watchdog_pretimeout = sec * USEC_PER_SEC; @@ -181,7 +181,7 @@ static int watchdog_set_pretimeout(void) { return 0; } - return log_error_errno(errno, "Failed to set pretimeout to %s: %m", FORMAT_TIMESPAN(sec, USEC_PER_SEC)); + return log_error_errno(errno, "Failed to set watchdog pretimeout to %s: %m", FORMAT_TIMESPAN(sec, USEC_PER_SEC)); } /* The set ioctl does not return the actual value set so get it now. */ @@ -274,10 +274,10 @@ static int update_timeout(void) { r = watchdog_set_timeout(); if (r < 0) { if (!ERRNO_IS_NOT_SUPPORTED(r)) - return log_error_errno(r, "Failed to set timeout to %s: %m", + return log_error_errno(r, "Failed to set watchdog hardware timeout to %s: %m", FORMAT_TIMESPAN(watchdog_timeout, 0)); - log_info("Modifying watchdog timeout is not supported, reusing the programmed timeout."); + log_info("Modifying watchdog hardware timeout is not supported, reusing the programmed timeout."); watchdog_timeout = USEC_INFINITY; } } @@ -286,8 +286,8 @@ static int update_timeout(void) { r = watchdog_read_timeout(); if (r < 0) { if (!ERRNO_IS_NOT_SUPPORTED(r)) - return log_error_errno(r, "Failed to query watchdog HW timeout: %m"); - log_info("Reading watchdog timeout is not supported, reusing the configured timeout."); + return log_error_errno(r, "Failed to query watchdog hardware timeout: %m"); + log_info("Reading watchdog hardware timeout is not supported, reusing the configured timeout."); watchdog_timeout = previous_timeout; } } @@ -302,7 +302,7 @@ static int update_timeout(void) { if (r < 0) return r; - log_info("Watchdog running with a timeout of %s.", FORMAT_TIMESPAN(watchdog_timeout, 0)); + log_info("Watchdog running with a hardware timeout of %s.", FORMAT_TIMESPAN(watchdog_timeout, 0)); return watchdog_ping_now(); } diff --git a/src/systemctl/systemctl-logind.c b/src/systemctl/systemctl-logind.c index 268e528..7f97325 100644 --- 
a/src/systemctl/systemctl-logind.c +++ b/src/systemctl/systemctl-logind.c @@ -392,7 +392,7 @@ int logind_show_shutdown(void) { return r; if (isempty(action)) - return log_error_errno(SYNTHETIC_ERRNO(ENODATA), "No scheduled shutdown."); + return log_full_errno(arg_quiet ? LOG_DEBUG : LOG_ERR, SYNTHETIC_ERRNO(ENODATA), "No scheduled shutdown."); if (STR_IN_SET(action, "halt", "poweroff", "exit")) pretty_action = "Shutdown"; diff --git a/src/systemctl/systemctl-show.c b/src/systemctl/systemctl-show.c index e7fabcf..5d1eb49 100644 --- a/src/systemctl/systemctl-show.c +++ b/src/systemctl/systemctl-show.c @@ -2255,7 +2255,7 @@ static int get_unit_dbus_path_by_pid( * sends the numeric PID. */ pidfd = pidfd_open(pid, 0); - if (pidfd < 0 && ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + if (pidfd < 0 && (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))) return get_unit_dbus_path_by_pid_fallback(bus, pid, ret_path, ret_unit); if (pidfd < 0) return log_error_errno(errno, "Failed to open PID %"PRIu32": %m", pid); diff --git a/src/systemd/sd-bus-vtable.h b/src/systemd/sd-bus-vtable.h index 5e80ea8..d06c5c3 100644 --- a/src/systemd/sd-bus-vtable.h +++ b/src/systemd/sd-bus-vtable.h @@ -208,6 +208,7 @@ struct sd_bus_vtable { _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, \ NAME, ...) NAME +#define _SD_VARARGS_FOREACH_EVEN_00(FN) #define _SD_VARARGS_FOREACH_EVEN_01(FN, X) FN(X) #define _SD_VARARGS_FOREACH_EVEN_02(FN, X, Y) FN(X) #define _SD_VARARGS_FOREACH_EVEN_04(FN, X, Y, ...) 
FN(X) _SD_VARARGS_FOREACH_EVEN_02(FN, __VA_ARGS__) @@ -261,9 +262,11 @@ struct sd_bus_vtable { _SD_VARARGS_FOREACH_EVEN_08, _SD_VARARGS_FOREACH_EVEN_07, \ _SD_VARARGS_FOREACH_EVEN_06, _SD_VARARGS_FOREACH_EVEN_05, \ _SD_VARARGS_FOREACH_EVEN_04, _SD_VARARGS_FOREACH_EVEN_03, \ - _SD_VARARGS_FOREACH_EVEN_02, _SD_VARARGS_FOREACH_EVEN_01) \ + _SD_VARARGS_FOREACH_EVEN_02, _SD_VARARGS_FOREACH_EVEN_01, \ + _SD_VARARGS_FOREACH_EVEN_00) \ (FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_00(FN) #define _SD_VARARGS_FOREACH_ODD_01(FN, X) #define _SD_VARARGS_FOREACH_ODD_02(FN, X, Y) FN(Y) #define _SD_VARARGS_FOREACH_ODD_04(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_02(FN, __VA_ARGS__) @@ -317,7 +320,8 @@ struct sd_bus_vtable { _SD_VARARGS_FOREACH_ODD_08, _SD_VARARGS_FOREACH_ODD_07, \ _SD_VARARGS_FOREACH_ODD_06, _SD_VARARGS_FOREACH_ODD_05, \ _SD_VARARGS_FOREACH_ODD_04, _SD_VARARGS_FOREACH_ODD_03, \ - _SD_VARARGS_FOREACH_ODD_02, _SD_VARARGS_FOREACH_ODD_01) \ + _SD_VARARGS_FOREACH_ODD_02, _SD_VARARGS_FOREACH_ODD_01, \ + _SD_VARARGS_FOREACH_ODD_00) \ (FN, __VA_ARGS__) #define SD_BUS_ARGS(...) 
__VA_ARGS__ diff --git a/src/test/test-btrfs.c b/src/test/test-btrfs.c index 205142e..6dff709 100644 --- a/src/test/test-btrfs.c +++ b/src/test/test-btrfs.c @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) { if (r < 0) log_error_errno(r, "Failed to make snapshot: %m"); if (r >= 0) - assert_se(xopenat_lock(AT_FDCWD, "/xxxtest4", 0, 0, 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + assert_se(xopenat_lock(AT_FDCWD, "/xxxtest4", 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); safe_close(r); diff --git a/src/test/test-copy.c b/src/test/test-copy.c index f3144f0..9674e78 100644 --- a/src/test/test-copy.c +++ b/src/test/test-copy.c @@ -520,13 +520,35 @@ TEST(copy_lock) { assert_se((fd = copy_directory_at(tfd, "abc", tfd, "qed", COPY_LOCK_BSD)) >= 0); assert_se(faccessat(tfd, "qed", F_OK, 0) >= 0); assert_se(faccessat(tfd, "qed/def", F_OK, 0) >= 0); - assert_se(xopenat_lock(tfd, "qed", 0, 0, 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + assert_se(xopenat_lock(tfd, "qed", 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); fd = safe_close(fd); assert_se((fd = copy_file_at(tfd, "abc/def", tfd, "poi", 0, 0644, COPY_LOCK_BSD))); assert_se(read_file_at_and_streq(tfd, "poi", "abc\n")); - assert_se(xopenat_lock(tfd, "poi", 0, 0, 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + assert_se(xopenat_lock(tfd, "poi", 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); fd = safe_close(fd); } +TEST(copy_verify_linked) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd_1 = -EBADF, fd_2 = -EBADF; + + tfd = mkdtemp_open(NULL, O_PATH, &t); + assert_se(tfd >= 0); + + assert_se(write_string_file_at(tfd, "hoge", "bar bar", WRITE_STRING_FILE_CREATE) >= 0); + + fd_1 = openat(tfd, "hoge", O_CLOEXEC | O_NOCTTY | O_RDONLY); + assert_se(fd_1 >= 0); + fd_2 = openat(tfd, "hoge", O_CLOEXEC | O_NOCTTY | O_RDONLY); + assert_se(fd_2 >= 0); + assert_se(unlinkat(tfd, "hoge", 0) >= 0); + + assert_se(copy_file_at(fd_1, NULL, tfd, "to_1", 0, 0644, 0) >= 0); + 
assert_se(read_file_at_and_streq(tfd, "to_1", "bar bar\n")); + + assert_se(copy_file_at(fd_2, NULL, tfd, "to_2", O_EXCL, 0644, COPY_VERIFY_LINKED) == -EIDRM); + assert_se(faccessat(tfd, "to_2", F_OK, AT_SYMLINK_NOFOLLOW) < 0 && errno == ENOENT); +} + DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-fs-util.c b/src/test/test-fs-util.c index ef335b4..b32feff 100644 --- a/src/test/test-fs-util.c +++ b/src/test/test-fs-util.c @@ -673,37 +673,37 @@ TEST(openat_report_new) { assert_se(b); } -TEST(xopenat) { +TEST(xopenat_full) { _cleanup_(rm_rf_physical_and_freep) char *t = NULL; _cleanup_close_ int tfd = -EBADF, fd = -EBADF, fd2 = -EBADF; assert_se((tfd = mkdtemp_open(NULL, 0, &t)) >= 0); - /* Test that xopenat() creates directories if O_DIRECTORY is specified. */ + /* Test that xopenat_full() creates directories if O_DIRECTORY is specified. */ - assert_se((fd = xopenat(tfd, "abc", O_DIRECTORY|O_CREAT|O_EXCL|O_CLOEXEC, 0, 0755)) >= 0); + assert_se((fd = xopenat_full(tfd, "abc", O_DIRECTORY|O_CREAT|O_EXCL|O_CLOEXEC, 0, 0755)) >= 0); assert_se((fd_verify_directory(fd) >= 0)); fd = safe_close(fd); - assert_se(xopenat(tfd, "abc", O_DIRECTORY|O_CREAT|O_EXCL|O_CLOEXEC, 0, 0755) == -EEXIST); + assert_se(xopenat_full(tfd, "abc", O_DIRECTORY|O_CREAT|O_EXCL|O_CLOEXEC, 0, 0755) == -EEXIST); - assert_se((fd = xopenat(tfd, "abc", O_DIRECTORY|O_CREAT|O_CLOEXEC, 0, 0755)) >= 0); + assert_se((fd = xopenat_full(tfd, "abc", O_DIRECTORY|O_CREAT|O_CLOEXEC, 0, 0755)) >= 0); assert_se((fd_verify_directory(fd) >= 0)); fd = safe_close(fd); - /* Test that xopenat() creates regular files if O_DIRECTORY is not specified. */ + /* Test that xopenat_full() creates regular files if O_DIRECTORY is not specified. 
*/ - assert_se((fd = xopenat(tfd, "def", O_CREAT|O_EXCL|O_CLOEXEC, 0, 0644)) >= 0); + assert_se((fd = xopenat_full(tfd, "def", O_CREAT|O_EXCL|O_CLOEXEC, 0, 0644)) >= 0); assert_se(fd_verify_regular(fd) >= 0); fd = safe_close(fd); - /* Test that we can reopen an existing fd with xopenat() by specifying an empty path. */ + /* Test that we can reopen an existing fd with xopenat_full() by specifying an empty path. */ - assert_se((fd = xopenat(tfd, "def", O_PATH|O_CLOEXEC, 0, 0)) >= 0); - assert_se((fd2 = xopenat(fd, "", O_RDWR|O_CLOEXEC, 0, 0644)) >= 0); + assert_se((fd = xopenat_full(tfd, "def", O_PATH|O_CLOEXEC, 0, 0)) >= 0); + assert_se((fd2 = xopenat_full(fd, "", O_RDWR|O_CLOEXEC, 0, 0644)) >= 0); } -TEST(xopenat_lock) { +TEST(xopenat_lock_full) { _cleanup_(rm_rf_physical_and_freep) char *t = NULL; _cleanup_close_ int tfd = -EBADF, fd = -EBADF; siginfo_t si; @@ -714,11 +714,11 @@ TEST(xopenat_lock) { * and close the file descriptor and still properly create the directory and acquire the lock in * another process. 
*/ - fd = xopenat_lock(tfd, "abc", O_CREAT|O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX); + fd = xopenat_lock_full(tfd, "abc", O_CREAT|O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX); assert_se(fd >= 0); assert_se(faccessat(tfd, "abc", F_OK, 0) >= 0); assert_se(fd_verify_directory(fd) >= 0); - assert_se(xopenat_lock(tfd, "abc", O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + assert_se(xopenat_lock_full(tfd, "abc", O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); pid_t pid = fork(); assert_se(pid >= 0); @@ -726,21 +726,21 @@ TEST(xopenat_lock) { if (pid == 0) { safe_close(fd); - fd = xopenat_lock(tfd, "abc", O_CREAT|O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX); + fd = xopenat_lock_full(tfd, "abc", O_CREAT|O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX); assert_se(fd >= 0); assert_se(faccessat(tfd, "abc", F_OK, 0) >= 0); assert_se(fd_verify_directory(fd) >= 0); - assert_se(xopenat_lock(tfd, "abc", O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + assert_se(xopenat_lock_full(tfd, "abc", O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); _exit(EXIT_SUCCESS); } - /* We need to give the child process some time to get past the xopenat() call in xopenat_lock() and - * block in the call to lock_generic() waiting for the lock to become free. We can't modify - * xopenat_lock() to signal an eventfd to let us know when that has happened, so we just sleep for a - * little and assume that's enough time for the child process to get along far enough. It doesn't - * matter if it doesn't get far enough, in that case we just won't trigger the fallback logic in - * xopenat_lock(), but the test will still succeed. */ + /* We need to give the child process some time to get past the xopenat() call in xopenat_lock_full() + * and block in the call to lock_generic() waiting for the lock to become free. 
We can't modify + * xopenat_lock_full() to signal an eventfd to let us know when that has happened, so we just sleep + * for a little and assume that's enough time for the child process to get along far enough. It + * doesn't matter if it doesn't get far enough, in that case we just won't trigger the fallback logic + * in xopenat_lock_full(), but the test will still succeed. */ assert_se(usleep_safe(20 * USEC_PER_MSEC) >= 0); assert_se(unlinkat(tfd, "abc", AT_REMOVEDIR) >= 0); @@ -749,8 +749,8 @@ TEST(xopenat_lock) { assert_se(wait_for_terminate(pid, &si) >= 0); assert_se(si.si_code == CLD_EXITED); - assert_se(xopenat_lock(tfd, "abc", 0, 0, 0755, LOCK_POSIX, LOCK_EX) == -EBADF); - assert_se(xopenat_lock(tfd, "def", O_DIRECTORY, 0, 0755, LOCK_POSIX, LOCK_EX) == -EBADF); + assert_se(xopenat_lock_full(tfd, "abc", 0, 0, 0755, LOCK_POSIX, LOCK_EX) == -EBADF); + assert_se(xopenat_lock_full(tfd, "def", O_DIRECTORY, 0, 0755, LOCK_POSIX, LOCK_EX) == -EBADF); } static int intro(void) { diff --git a/src/test/test-open-file.c b/src/test/test-open-file.c index 1b938ec..4314d0d 100644 --- a/src/test/test-open-file.c +++ b/src/test/test-open-file.c @@ -172,14 +172,12 @@ TEST(open_file_to_string) { assert_se(streq(s, "/proc/1/ns/mnt::read-only")); s = mfree(s); - assert_se(free_and_strdup(&of->path, "/path:with:colon")); - assert_se(free_and_strdup(&of->fdname, "path:with:colon")); + assert_se(free_and_strdup(&of->path, "/path:with:colon") >= 0); + assert_se(free_and_strdup(&of->fdname, "path:with:colon") >= 0); of->flags = 0; - r = open_file_to_string(of, &s); - - assert_se(r >= 0); - assert_se(streq(s, "/path\\:with\\:colon")); + assert_se(open_file_to_string(of, &s) >= 0); + assert_se(streq(s, "/path\\x3awith\\x3acolon")); } DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-stat-util.c b/src/test/test-stat-util.c index 5aca207..8d7fd5b 100644 --- a/src/test/test-stat-util.c +++ b/src/test/test-stat-util.c @@ -180,6 +180,25 @@ TEST(dir_is_empty) { 
assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ false) > 0); } +TEST(fd_verify_linked) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *p = NULL; + + tfd = mkdtemp_open(NULL, O_PATH, &t); + assert_se(tfd >= 0); + + assert_se(p = path_join(t, "hoge")); + assert_se(touch(p) >= 0); + + fd = open(p, O_CLOEXEC | O_PATH); + assert_se(fd >= 0); + + assert_se(fd_verify_linked(fd) >= 0); + assert_se(unlinkat(tfd, "hoge", 0) >= 0); + assert_se(fd_verify_linked(fd) == -EIDRM); +} + static int intro(void) { log_show_color(true); return EXIT_SUCCESS; diff --git a/src/tmpfiles/tmpfiles.c b/src/tmpfiles/tmpfiles.c index bc83aab..4919cb7 100644 --- a/src/tmpfiles/tmpfiles.c +++ b/src/tmpfiles/tmpfiles.c @@ -817,7 +817,7 @@ static int dir_cleanup( cutoff_nsec, sub_path, age_by_file, false)) continue; - fd = xopenat(dirfd(d), + fd = xopenat_full(dirfd(d), de->d_name, O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME|O_NONBLOCK, /* xopen_flags = */ 0, @@ -1173,60 +1173,47 @@ static int parse_acls_from_arg(Item *item) { #if HAVE_ACL static int parse_acl_cond_exec( const char *path, - acl_t access, /* could be empty (NULL) */ - acl_t cond_exec, const struct stat *st, + acl_t cond_exec, + acl_t access, /* could be empty (NULL) */ bool append, acl_t *ret) { - _cleanup_(acl_freep) acl_t parsed = NULL; acl_entry_t entry; acl_permset_t permset; bool has_exec; int r; assert(path); - assert(ret); assert(st); + assert(cond_exec); + assert(ret); - parsed = access ? 
acl_dup(access) : acl_init(0); - if (!parsed) - return -errno; - - /* Since we substitute 'X' with 'x' in parse_acl(), we just need to copy the entries over - * for directories */ - if (S_ISDIR(st->st_mode)) { - for (r = acl_get_entry(cond_exec, ACL_FIRST_ENTRY, &entry); - r > 0; - r = acl_get_entry(cond_exec, ACL_NEXT_ENTRY, &entry)) { - - acl_entry_t parsed_entry; - - if (acl_create_entry(&parsed, &parsed_entry) < 0) - return -errno; - - if (acl_copy_entry(parsed_entry, entry) < 0) - return -errno; - } - if (r < 0) - return -errno; - - goto finish; - } - - has_exec = st->st_mode & S_IXUSR; - - if (!has_exec && append) { + if (!S_ISDIR(st->st_mode)) { _cleanup_(acl_freep) acl_t old = NULL; old = acl_get_file(path, ACL_TYPE_ACCESS); if (!old) return -errno; + has_exec = false; + for (r = acl_get_entry(old, ACL_FIRST_ENTRY, &entry); r > 0; r = acl_get_entry(old, ACL_NEXT_ENTRY, &entry)) { + acl_tag_t tag; + + if (acl_get_tag_type(entry, &tag) < 0) + return -errno; + + if (tag == ACL_MASK) + continue; + + /* If not appending, skip ACL definitions */ + if (!append && IN_SET(tag, ACL_USER, ACL_GROUP)) + continue; + if (acl_get_permset(entry, &permset) < 0) return -errno; @@ -1240,28 +1227,33 @@ static int parse_acl_cond_exec( } if (r < 0) return -errno; - } - /* Check if we're about to set the execute bit in acl_access */ - if (!has_exec && access) { - for (r = acl_get_entry(access, ACL_FIRST_ENTRY, &entry); - r > 0; - r = acl_get_entry(access, ACL_NEXT_ENTRY, &entry)) { + /* Check if we're about to set the execute bit in acl_access */ + if (!has_exec && access) { + for (r = acl_get_entry(access, ACL_FIRST_ENTRY, &entry); + r > 0; + r = acl_get_entry(access, ACL_NEXT_ENTRY, &entry)) { - if (acl_get_permset(entry, &permset) < 0) - return -errno; + if (acl_get_permset(entry, &permset) < 0) + return -errno; - r = acl_get_perm(permset, ACL_EXECUTE); + r = acl_get_perm(permset, ACL_EXECUTE); + if (r < 0) + return -errno; + if (r > 0) { + has_exec = true; + break; + } + } if 
(r < 0) return -errno; - if (r > 0) { - has_exec = true; - break; - } } - if (r < 0) - return -errno; - } + } else + has_exec = true; + + _cleanup_(acl_freep) acl_t parsed = access ? acl_dup(access) : acl_init(0); + if (!parsed) + return -errno; for (r = acl_get_entry(cond_exec, ACL_FIRST_ENTRY, &entry); r > 0; @@ -1275,6 +1267,7 @@ static int parse_acl_cond_exec( if (acl_copy_entry(parsed_entry, entry) < 0) return -errno; + /* We substituted 'X' with 'x' in parse_acl(), so drop execute bit here if not applicable. */ if (!has_exec) { if (acl_get_permset(parsed_entry, &permset) < 0) return -errno; @@ -1286,7 +1279,6 @@ static int parse_acl_cond_exec( if (r < 0) return -errno; -finish: if (!append) { /* want_mask = true */ r = calc_acl_mask_if_needed(&parsed); if (r < 0) @@ -1390,10 +1382,9 @@ static int fd_set_acls( } if (item->acl_access_exec) { - r = parse_acl_cond_exec(FORMAT_PROC_FD_PATH(fd), - item->acl_access, + r = parse_acl_cond_exec(FORMAT_PROC_FD_PATH(fd), st, item->acl_access_exec, - st, + item->acl_access, item->append_or_force, &access_with_exec_parsed); if (r < 0) diff --git a/src/tpm2-setup/tpm2-setup.c b/src/tpm2-setup/tpm2-setup.c index 0be7ffc..35628fc 100644 --- a/src/tpm2-setup/tpm2-setup.c +++ b/src/tpm2-setup/tpm2-setup.c @@ -18,6 +18,7 @@ static char *arg_tpm2_device = NULL; static bool arg_early = false; +static bool arg_graceful = false; STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep); @@ -43,6 +44,7 @@ static int help(int argc, char *argv[], void *userdata) { " --tpm2-device=PATH\n" " Pick TPM2 device\n" " --early=BOOL Store SRK public key in /run/ rather than /var/lib/\n" + " --graceful Exit gracefully if no TPM2 device is found\n" "\nSee the %2$s for details.\n", program_invocation_short_name, link, @@ -59,6 +61,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_VERSION = 0x100, ARG_TPM2_DEVICE, ARG_EARLY, + ARG_GRACEFUL, }; static const struct option options[] = { @@ -66,6 +69,7 @@ static int parse_argv(int argc, char *argv[]) { 
{ "version", no_argument, NULL, ARG_VERSION }, { "tpm2-device", required_argument, NULL, ARG_TPM2_DEVICE }, { "early", required_argument, NULL, ARG_EARLY }, + { "graceful", no_argument, NULL, ARG_GRACEFUL }, {} }; @@ -100,6 +104,10 @@ static int parse_argv(int argc, char *argv[]) { arg_early = r; break; + case ARG_GRACEFUL: + arg_graceful = true; + break; + case '?': return -EINVAL; @@ -204,9 +212,9 @@ static int load_public_key_tpm2(struct public_key_data *ret) { assert(ret); - r = tpm2_context_new(arg_tpm2_device, &c); + r = tpm2_context_new_or_warn(arg_tpm2_device, &c); if (r < 0) - return log_error_errno(r, "Failed to create TPM2 context: %m"); + return r; r = tpm2_get_or_create_srk( c, @@ -247,6 +255,11 @@ static int run(int argc, char *argv[]) { if (r <= 0) return r; + if (arg_graceful && tpm2_support() != TPM2_SUPPORT_FULL) { + log_notice("No complete TPM2 support detected, exiting gracefully."); + return EXIT_SUCCESS; + } + umask(0022); _cleanup_(public_key_data_done) struct public_key_data runtime_key = {}, persistent_key = {}, tpm2_key = {}; diff --git a/src/ukify/ukify.py b/src/ukify/ukify.py index 6abf1b6..b0d0961 100755 --- a/src/ukify/ukify.py +++ b/src/ukify/ukify.py @@ -40,6 +40,7 @@ import subprocess import sys import tempfile import textwrap +import struct from hashlib import sha256 from typing import (Any, Callable, @@ -128,6 +129,45 @@ def try_import(modname, name=None): except ImportError as e: raise ValueError(f'Kernel is compressed with {name or modname}, but module unavailable') from e +def get_zboot_kernel(f): + """Decompress zboot efistub kernel if compressed. 
Return contents.""" + # See linux/drivers/firmware/efi/libstub/Makefile.zboot + # and linux/drivers/firmware/efi/libstub/zboot-header.S + + # 4 bytes at offset 0x08 contain the starting offset of compressed data + f.seek(8) + _start = f.read(4) + start = struct.unpack('n_entries; i++) { UidRangeEntry *x = p->entries + i; @@ -534,7 +537,8 @@ static int table_add_gid_boundaries(Table *table, const UidRange *p) { for (size_t i = 0; i < ELEMENTSOF(uid_range_table); i++) { _cleanup_free_ char *name = NULL, *comment = NULL; - if (!uid_range_covers(p, uid_range_table[i].first, uid_range_table[i].last)) + if (!uid_range_covers(p, uid_range_table[i].first, + uid_range_table[i].last - uid_range_table[i].first + 1)) continue; name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_DOWN), diff --git a/src/userdb/userdbd-manager.c b/src/userdb/userdbd-manager.c index c1dfe47..359c827 100644 --- a/src/userdb/userdbd-manager.c +++ b/src/userdb/userdbd-manager.c @@ -5,6 +5,7 @@ #include "sd-daemon.h" #include "common-signal.h" +#include "env-util.h" #include "fd-util.h" #include "fs-util.h" #include "mkdir.h" @@ -156,7 +157,6 @@ static int start_one_worker(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to fork new worker child: %m"); if (r == 0) { - char pids[DECIMAL_STR_MAX(pid_t)]; /* Child */ if (m->listen_fd == 3) { @@ -174,9 +174,9 @@ static int start_one_worker(Manager *m) { safe_close(m->listen_fd); } - xsprintf(pids, PID_FMT, pid); - if (setenv("LISTEN_PID", pids, 1) < 0) { - log_error_errno(errno, "Failed to set $LISTEN_PID: %m"); + r = setenvf("LISTEN_PID", /* overwrite= */ true, PID_FMT, pid); + if (r < 0) { + log_error_errno(r, "Failed to set $LISTEN_PID: %m"); _exit(EXIT_FAILURE); } diff --git a/test/TEST-69-SHUTDOWN/test.sh b/test/TEST-69-SHUTDOWN/test.sh index 8fdbaf8..0e12857 100755 --- a/test/TEST-69-SHUTDOWN/test.sh +++ b/test/TEST-69-SHUTDOWN/test.sh @@ -38,6 +38,7 @@ EOF inst /usr/bin/screen echo "PS1='screen\$WINDOW # '" >>"$workspace/root/.bashrc" + echo 
"TERM=linux" >>"$workspace/root/.bash_profile" echo 'startup_message off' >"$workspace/etc/screenrc" echo 'bell_msg ""' >>"$workspace/etc/screenrc" } diff --git a/test/test-functions b/test/test-functions index 0698b30..f7376bf 100644 --- a/test/test-functions +++ b/test/test-functions @@ -876,6 +876,7 @@ EOF [Service] Type=oneshot RemainAfterExit=yes +SyslogIdentifier=sysext-foo ExecStart=echo foo [Install] @@ -2102,7 +2103,7 @@ install_testuser() { # create unprivileged user for user manager tests mkdir -p "${initdir:?}/etc/sysusers.d" cat >"$initdir/etc/sysusers.d/testuser.conf" < 0: time.sleep(sleep_time) +def resolvectl(*args): + return check_output(*(resolvectl_cmd + list(args)), env=env) + +def timedatectl(*args): + return check_output(*(timedatectl_cmd + list(args)), env=env) + def setup_common(): print() @@ -891,7 +913,6 @@ class Utilities(): def wait_activated(self, link, state='down', timeout=20, fail_assert=True): # wait for the interface is activated. - invocation_id = check_output('systemctl show systemd-networkd -p InvocationID --value') needle = f'{link}: Bringing link {state}' flag = state.upper() for iteration in range(timeout + 1): @@ -899,7 +920,7 @@ class Utilities(): time.sleep(1) if not link_exists(link): continue - output = check_output('journalctl _SYSTEMD_INVOCATION_ID=' + invocation_id) + output = read_networkd_log() if needle in output and flag in check_output(f'ip link show {link}'): return True if fail_assert: @@ -930,7 +951,7 @@ class Utilities(): time.sleep(1) if not link_exists(link): continue - output = check_output(*networkctl_cmd, '-n', '0', 'status', link, env=env) + output = networkctl_status(link) if re.search(rf'(?m)^\s*State:\s+{operstate}\s+\({setup_state}\)\s*$', output): return True @@ -971,11 +992,15 @@ class Utilities(): try: check_output(*args, env=wait_online_env) except subprocess.CalledProcessError: - # show detailed status on failure - for link in links_with_operstate: - name = link.split(':')[0] - if 
link_exists(name): - call(*networkctl_cmd, '-n', '0', 'status', name, env=env) + if networkd_is_failed(): + print('!!!!! systemd-networkd.service is failed !!!!!') + call('systemctl status systemd-networkd.service') + else: + # show detailed status on failure + for link in links_with_operstate: + name = link.split(':')[0] + if link_exists(name): + networkctl_status(name) raise if not bool_any and setup_state: for link in links_with_operstate: @@ -1068,7 +1093,7 @@ class NetworkctlTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummy98:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98', env=env) + output = networkctl_status('dummy98') self.assertRegex(output, 'hogehogehogehogehogehoge') @expectedFailureIfAlternativeNameIsNotAvailable() @@ -1078,7 +1103,7 @@ class NetworkctlTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummyalt:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummyalt', env=env) + output = networkctl_status('dummyalt') self.assertIn('hogehogehogehogehogehoge', output) self.assertNotIn('dummy98', output) @@ -1130,7 +1155,7 @@ class NetworkctlTests(unittest.TestCase, Utilities): def test_renew(self): def check(): self.wait_online(['veth99:routable', 'veth-peer:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertRegex(output, r'Address: 192.168.5.[0-9]* \(DHCP4 via 192.168.5.1\)') self.assertIn('Gateway: 192.168.5.3', output) @@ -1140,13 +1165,12 @@ class NetworkctlTests(unittest.TestCase, Utilities): copy_network_unit('25-veth.netdev', '25-dhcp-client.network', '25-dhcp-server.network') start_networkd() check() - output = check_output(*networkctl_cmd, '--lines=0', '--stats', '--all', '--full', '--json=short', 'status') - check_json(output) + check_json(networkctl_json('--lines=0', '--stats', '--all', '--full')) for verb in ['renew', 
'forcerenew']: - call_check(*networkctl_cmd, verb, 'veth99') + networkctl(verb, 'veth99') check() - call_check(*networkctl_cmd, verb, 'veth99', 'veth99', 'veth99') + networkctl(verb, 'veth99', 'veth99', 'veth99') check() def test_up_down(self): @@ -1154,13 +1178,13 @@ class NetworkctlTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummy98:routable']) - call_check(*networkctl_cmd, 'down', 'dummy98') + networkctl('down', 'dummy98') self.wait_online(['dummy98:off']) - call_check(*networkctl_cmd, 'up', 'dummy98') + networkctl('up', 'dummy98') self.wait_online(['dummy98:routable']) - call_check(*networkctl_cmd, 'down', 'dummy98', 'dummy98', 'dummy98') + networkctl('down', 'dummy98', 'dummy98', 'dummy98') self.wait_online(['dummy98:off']) - call_check(*networkctl_cmd, 'up', 'dummy98', 'dummy98', 'dummy98') + networkctl('up', 'dummy98', 'dummy98', 'dummy98') self.wait_online(['dummy98:routable']) def test_reload(self): @@ -1192,23 +1216,23 @@ class NetworkctlTests(unittest.TestCase, Utilities): self.wait_online(['test1:degraded']) - output = check_output(*networkctl_cmd, 'list', env=env) + output = networkctl('list') self.assertRegex(output, '1 lo ') self.assertRegex(output, 'test1') - output = check_output(*networkctl_cmd, 'list', 'test1', env=env) + output = networkctl('list', 'test1') self.assertNotRegex(output, '1 lo ') self.assertRegex(output, 'test1') - output = check_output(*networkctl_cmd, 'list', 'te*', env=env) + output = networkctl('list', 'te*') self.assertNotRegex(output, '1 lo ') self.assertRegex(output, 'test1') - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'te*', env=env) + output = networkctl_status('te*') self.assertNotRegex(output, '1: lo ') self.assertRegex(output, 'test1') - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'tes[a-z][0-9]', env=env) + output = networkctl_status('tes[a-z][0-9]') self.assertNotRegex(output, '1: lo ') self.assertRegex(output, 'test1') @@ -1218,7 +1242,7 @@ class 
NetworkctlTests(unittest.TestCase, Utilities): self.wait_online(['test1:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'test1', env=env) + output = networkctl_status('test1') self.assertRegex(output, 'MTU: 1600') def test_type(self): @@ -1226,11 +1250,11 @@ class NetworkctlTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['test1:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'test1', env=env) + output = networkctl_status('test1') print(output) self.assertRegex(output, 'Type: ether') - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'lo', env=env) + output = networkctl_status('lo') print(output) self.assertRegex(output, 'Type: loopback') @@ -1239,7 +1263,7 @@ class NetworkctlTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['test1:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'test1', env=env) + output = networkctl_status('test1') print(output) self.assertRegex(output, r'Link File: /run/systemd/network/25-default.link') self.assertRegex(output, r'Network File: /run/systemd/network/11-dummy.network') @@ -1248,7 +1272,7 @@ class NetworkctlTests(unittest.TestCase, Utilities): # In that case, the udev DB for the loopback network interface may already have ID_NET_LINK_FILE property. # Let's reprocess the interface and drop the property. 
check_output(*udevadm_cmd, 'trigger', '--settle', '--action=add', '/sys/class/net/lo') - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'lo', env=env) + output = networkctl_status('lo') print(output) self.assertRegex(output, r'Link File: n/a') self.assertRegex(output, r'Network File: n/a') @@ -1260,13 +1284,13 @@ class NetworkctlTests(unittest.TestCase, Utilities): self.wait_online(['test1:degraded', 'veth99:degraded', 'veth-peer:degraded']) - check_output(*networkctl_cmd, 'delete', 'test1', 'veth99', env=env) + networkctl('delete', 'test1', 'veth99') self.check_link_exists('test1', expected=False) self.check_link_exists('veth99', expected=False) self.check_link_exists('veth-peer', expected=False) def test_label(self): - call_check(*networkctl_cmd, 'label') + networkctl('label') class NetworkdMatchTests(unittest.TestCase, Utilities): @@ -1287,7 +1311,7 @@ class NetworkdMatchTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummy98:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98', env=env) + output = networkctl_status('dummy98') self.assertIn('Network File: /run/systemd/network/12-dummy-match-mac-01.network', output) output = check_output('ip -4 address show dev dummy98') self.assertIn('10.0.0.1/16', output) @@ -1297,7 +1321,7 @@ class NetworkdMatchTests(unittest.TestCase, Utilities): self.wait_address('dummy98', '10.0.0.2/16', ipv='-4', timeout_sec=10) self.wait_online(['dummy98:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98', env=env) + output = networkctl_status('dummy98') self.assertIn('Network File: /run/systemd/network/12-dummy-match-mac-02.network', output) check_output('ip link set dev dummy98 down') @@ -1305,7 +1329,7 @@ class NetworkdMatchTests(unittest.TestCase, Utilities): self.wait_address('dummy98-1', '10.0.1.2/16', ipv='-4', timeout_sec=10) self.wait_online(['dummy98-1:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 
'dummy98-1', env=env) + output = networkctl_status('dummy98-1') self.assertIn('Network File: /run/systemd/network/12-dummy-match-renamed.network', output) check_output('ip link set dev dummy98-1 down') @@ -1314,7 +1338,7 @@ class NetworkdMatchTests(unittest.TestCase, Utilities): self.wait_address('dummy98-2', '10.0.2.2/16', ipv='-4', timeout_sec=10) self.wait_online(['dummy98-2:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98-2', env=env) + output = networkctl_status('dummy98-2') self.assertIn('Network File: /run/systemd/network/12-dummy-match-altname.network', output) def test_match_udev_property(self): @@ -1322,7 +1346,7 @@ class NetworkdMatchTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummy98:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98', env=env) + output = networkctl_status('dummy98') print(output) self.assertRegex(output, 'Network File: /run/systemd/network/14-match-udev-property') @@ -1401,7 +1425,7 @@ class NetworkdNetDevTests(unittest.TestCase, Utilities): self.assertEqual(1, int(read_link_attr('bridge99', 'bridge', 'stp_state'))) self.assertEqual(3, int(read_link_attr('bridge99', 'bridge', 'multicast_igmp_version'))) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'bridge99', env=env) + output = networkctl_status('bridge99') print(output) self.assertRegex(output, 'Priority: 9') self.assertRegex(output, 'STP: yes') @@ -1434,14 +1458,14 @@ class NetworkdNetDevTests(unittest.TestCase, Utilities): self.check_link_attr('bond98', 'bonding', 'mode', 'balance-tlb 5') self.check_link_attr('bond98', 'bonding', 'tlb_dynamic_lb', '1') - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'bond99', env=env) + output = networkctl_status('bond99') print(output) self.assertIn('Mode: 802.3ad', output) self.assertIn('Miimon: 1s', output) self.assertIn('Updelay: 2s', output) self.assertIn('Downdelay: 2s', output) - output = check_output(*networkctl_cmd, 
'-n', '0', 'status', 'bond98', env=env) + output = networkctl_status('bond98') print(output) self.assertIn('Mode: balance-tlb', output) @@ -2314,7 +2338,7 @@ class NetworkdNetDevTests(unittest.TestCase, Utilities): self.assertIn('00:11:22:33:44:66 dst 10.0.0.6 self permanent', output) self.assertIn('00:11:22:33:44:77 dst 10.0.0.7 via test1 self permanent', output) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'vxlan99', env=env) + output = networkctl_status('vxlan99') print(output) self.assertIn('VNI: 999', output) self.assertIn('Destination Port: 5555', output) @@ -2555,8 +2579,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): # netlabel self.check_netlabel('dummy98', r'10\.10\.1\.0/24') - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) def test_address_static(self): copy_network_unit('25-address-static.network', '12-dummy.netdev', copy_dropins=False) @@ -2873,7 +2896,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): check_output(f'ip link set dev test1 carrier {carrier}') self.wait_online([f'test1:{routable_map[carrier]}:{routable_map[carrier]}']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'test1', env=env) + output = networkctl_status('test1') print(output) self.assertRegex(output, '192.168.0.15') self.assertRegex(output, '192.168.0.1') @@ -2897,7 +2920,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): check_output(f'ip link set dev test1 carrier {carrier}') self.wait_online([f'test1:{routable_map[carrier]}:{routable_map[carrier]}']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'test1', env=env) + output = networkctl_status('test1') print(output) if have_config: self.assertRegex(output, '192.168.0.15') @@ -2942,8 +2965,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): self.assertRegex(output, 'iif test1') self.assertRegex(output, 'lookup 10') - output = check_output(*networkctl_cmd, 
'--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) def test_routing_policy_rule_issue_11280(self): copy_network_unit('25-routing-policy-rule-test1.network', '11-dummy.netdev', @@ -3071,7 +3093,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummy98:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98', env=env) + output = networkctl_status('dummy98') print(output) print('### ip -6 route show dev dummy98') @@ -3174,8 +3196,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): self.assertIn('via 2001:1234:5:8fff:ff:ff:ff:ff dev dummy98', output) self.assertIn('via 2001:1234:5:9fff:ff:ff:ff:ff dev dummy98', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) copy_network_unit('25-address-static.network') networkctl_reload() @@ -3301,7 +3322,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummy98:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98', env=env) + output = networkctl_status('dummy98') print(output) print('### ip -6 route show dev dummy98') @@ -3446,8 +3467,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): self.assertNotIn('192.168.10.2', output) self.assertNotIn('00:00:5e:00:02:67', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) copy_network_unit('25-neighbor-section.network.d/override.conf') networkctl_reload() @@ -3500,8 +3520,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): self.assertRegex(output, '2001:db8:0:f102::17 lladdr 2a:?00:ff:?de:45:?67:ed:?de:[0:]*:49:?88 PERMANENT') self.assertNotIn('2001:db8:0:f102::18', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + 
check_json(networkctl_json()) def test_link_local_addressing(self): copy_network_unit('25-link-local-addressing-yes.network', '11-dummy.netdev', @@ -3562,6 +3581,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): print(output) self.assertRegex(output, 'inet6 .* scope link') + @unittest.skip("Re-enable once https://github.com/systemd/systemd/issues/30056 is resolved") def test_sysctl(self): copy_networkd_conf_dropin('25-global-ipv6-privacy-extensions.conf') copy_network_unit('25-sysctl.network', '12-dummy.netdev', copy_dropins=False) @@ -3790,7 +3810,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): # default is true, if neither are specified expected = True - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'test1', env=env) + output = networkctl_status('test1') print(output) yesno = 'yes' if expected else 'no' @@ -3814,7 +3834,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['dummy98:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'dummy98', env=env) + output = networkctl_status('dummy98') print(output) self.assertRegex(output, 'Address: 192.168.42.100') self.assertRegex(output, 'DNS: 192.168.42.1') @@ -3900,8 +3920,7 @@ class NetworkdNetworkTests(unittest.TestCase, Utilities): self.assertIn('nexthop via 192.168.20.1 dev dummy98 weight 1', output) self.assertIn('nexthop via 192.168.5.1 dev veth99 weight 3', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) copy_network_unit('25-nexthop.network', '25-veth.netdev', '25-veth-peer.network', '12-dummy.netdev', '25-nexthop-dummy.network') @@ -4204,6 +4223,23 @@ class NetworkdTCTests(unittest.TestCase, Utilities): print(output) self.assertRegex(output, 'qdisc teql1 31: root') + @expectedFailureIfModuleIsNotAvailable('sch_fq', 'sch_sfq', 'sch_tbf') + def test_qdisc_drop(self): + copy_network_unit('12-dummy.netdev', 
'12-dummy.network') + start_networkd() + self.wait_online(['dummy98:routable']) + + # Test case for issue #32247 and #32254. + for _ in range(20): + check_output('tc qdisc replace dev dummy98 root fq') + self.assertFalse(networkd_is_failed()) + check_output('tc qdisc replace dev dummy98 root fq pacing') + self.assertFalse(networkd_is_failed()) + check_output('tc qdisc replace dev dummy98 handle 10: root tbf rate 0.5mbit burst 5kb latency 70ms peakrate 1mbit minburst 1540') + self.assertFalse(networkd_is_failed()) + check_output('tc qdisc add dev dummy98 parent 10:1 handle 100: sfq') + self.assertFalse(networkd_is_failed()) + class NetworkdStateFileTests(unittest.TestCase, Utilities): def setUp(self): @@ -4218,10 +4254,9 @@ class NetworkdStateFileTests(unittest.TestCase, Utilities): self.wait_online(['dummy98:routable']) # make link state file updated - check_output(*resolvectl_cmd, 'revert', 'dummy98', env=env) + resolvectl('revert', 'dummy98') - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) output = read_link_state_file('dummy98') print(output) @@ -4242,15 +4277,14 @@ class NetworkdStateFileTests(unittest.TestCase, Utilities): self.assertIn('MDNS=yes', output) self.assertIn('DNSSEC=no', output) - check_output(*resolvectl_cmd, 'dns', 'dummy98', '10.10.10.12#ccc.com', '10.10.10.13', '1111:2222::3333', env=env) - check_output(*resolvectl_cmd, 'domain', 'dummy98', 'hogehogehoge', '~foofoofoo', env=env) - check_output(*resolvectl_cmd, 'llmnr', 'dummy98', 'yes', env=env) - check_output(*resolvectl_cmd, 'mdns', 'dummy98', 'no', env=env) - check_output(*resolvectl_cmd, 'dnssec', 'dummy98', 'yes', env=env) - check_output(*timedatectl_cmd, 'ntp-servers', 'dummy98', '2.fedora.pool.ntp.org', '3.fedora.pool.ntp.org', env=env) + resolvectl('dns', 'dummy98', '10.10.10.12#ccc.com', '10.10.10.13', '1111:2222::3333') + resolvectl('domain', 'dummy98', 'hogehogehoge', '~foofoofoo') + resolvectl('llmnr', 
'dummy98', 'yes') + resolvectl('mdns', 'dummy98', 'no') + resolvectl('dnssec', 'dummy98', 'yes') + timedatectl('ntp-servers', 'dummy98', '2.fedora.pool.ntp.org', '3.fedora.pool.ntp.org') - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) output = read_link_state_file('dummy98') print(output) @@ -4262,10 +4296,9 @@ class NetworkdStateFileTests(unittest.TestCase, Utilities): self.assertIn('MDNS=no', output) self.assertIn('DNSSEC=yes', output) - check_output(*timedatectl_cmd, 'revert', 'dummy98', env=env) + timedatectl('revert', 'dummy98') - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) output = read_link_state_file('dummy98') print(output) @@ -4277,10 +4310,9 @@ class NetworkdStateFileTests(unittest.TestCase, Utilities): self.assertIn('MDNS=no', output) self.assertIn('DNSSEC=yes', output) - check_output(*resolvectl_cmd, 'revert', 'dummy98', env=env) + resolvectl('revert', 'dummy98') - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) output = read_link_state_file('dummy98') print(output) @@ -4668,7 +4700,7 @@ class NetworkdBridgeTests(unittest.TestCase, Utilities): self.wait_online(['bridge99:no-carrier:no-carrier']) self.check_link_attr('bridge99', 'carrier', '0') - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'bridge99', env=env) + output = networkctl_status('bridge99') self.assertRegex(output, '10.1.2.3') self.assertRegex(output, '10.1.2.1') @@ -4848,7 +4880,7 @@ class NetworkdLLDPTests(unittest.TestCase, Utilities): if trial > 0: time.sleep(1) - output = check_output(*networkctl_cmd, 'lldp', env=env) + output = networkctl('lldp') print(output) if re.search(r'veth99 .* veth-peer', output): break @@ -4871,16 +4903,16 @@ class NetworkdRATests(unittest.TestCase, Utilities): start_networkd() 
self.wait_online(['veth99:routable', 'veth-peer:degraded']) - output = check_output(*resolvectl_cmd, 'dns', 'veth99', env=env) + output = resolvectl('dns', 'veth99') print(output) self.assertRegex(output, 'fe80::') self.assertRegex(output, '2002:da8:1::1') - output = check_output(*resolvectl_cmd, 'domain', 'veth99', env=env) + output = resolvectl('domain', 'veth99') print(output) self.assertIn('hogehoge.test', output) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertRegex(output, '2002:da8:1:0') @@ -4900,7 +4932,7 @@ class NetworkdRATests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertRegex(output, '2002:da8:1:0:1a:2b:3c:4d') self.assertRegex(output, '2002:da8:1:0:fa:de:ca:fe') @@ -4912,7 +4944,7 @@ class NetworkdRATests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertIn('2002:da8:1:0:b47e:7975:fc7a:7d6e', output) self.assertIn('2002:da8:2:0:1034:56ff:fe78:9abc', output) # EUI64 @@ -4922,7 +4954,7 @@ class NetworkdRATests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:degraded']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertIn('2002:da8:1:0:b47e:7975:fc7a:7d6e', output) self.assertIn('2002:da8:2:0:f689:561a:8eda:7443', output) @@ -4994,7 +5026,7 @@ class NetworkdRATests(unittest.TestCase, Utilities): self.wait_online(['client:routable']) self.wait_address('client', '2002:da8:1:99:1034:56ff:fe78:9a00/64', ipv='-6', 
timeout_sec=10) - output = check_output(*networkctl_cmd, 'status', 'client', env=env) + output = networkctl_status('client') print(output) self.assertIn('Captive Portal: http://systemd.io', output) @@ -5030,7 +5062,7 @@ class NetworkdRATests(unittest.TestCase, Utilities): self.wait_online(['client:routable']) self.wait_address('client', '2002:da8:1:99:1034:56ff:fe78:9a00/64', ipv='-6', timeout_sec=10) - output = check_output(*networkctl_cmd, 'status', 'client', env=env) + output = networkctl_status('client') print(output) self.assertNotIn('Captive Portal:', output) @@ -5047,14 +5079,14 @@ class NetworkdDHCPServerTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertRegex(output, r'Address: 192.168.5.[0-9]* \(DHCP4 via 192.168.5.1\)') self.assertIn('Gateway: 192.168.5.3', output) self.assertRegex(output, 'DNS: 192.168.5.1\n *192.168.5.10') self.assertRegex(output, 'NTP: 192.168.5.1\n *192.168.5.11') - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth-peer', env=env) + output = networkctl_status('veth-peer') self.assertRegex(output, "Offered DHCP leases: 192.168.5.[0-9]*") def test_dhcp_server_null_server_address(self): @@ -5070,14 +5102,14 @@ class NetworkdDHCPServerTests(unittest.TestCase, Utilities): client_address = json.loads(output)[0]['addr_info'][0]['local'] print(client_address) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertRegex(output, rf'Address: {client_address} \(DHCP4 via {server_address}\)') self.assertIn(f'Gateway: {server_address}', output) self.assertIn(f'DNS: {server_address}', output) self.assertIn(f'NTP: {server_address}', output) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth-peer', env=env) + output = 
networkctl_status('veth-peer') self.assertIn(f'Offered DHCP leases: {client_address}', output) def test_dhcp_server_with_uplink(self): @@ -5086,7 +5118,7 @@ class NetworkdDHCPServerTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertRegex(output, r'Address: 192.168.5.[0-9]* \(DHCP4 via 192.168.5.1\)') self.assertIn('Gateway: 192.168.5.3', output) @@ -5098,7 +5130,7 @@ class NetworkdDHCPServerTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertRegex(output, r'Address: 192.168.5.[0-9]* \(DHCP4 via 192.168.5.1\)') self.assertIn('Gateway: 192.168.5.1', output) @@ -5109,7 +5141,7 @@ class NetworkdDHCPServerTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertIn('Address: 10.1.1.200 (DHCP4 via 10.1.1.1)', output) self.assertIn('DHCP4 Client ID: 12:34:56:78:9a:bc', output) @@ -5119,7 +5151,7 @@ class NetworkdDHCPServerTests(unittest.TestCase, Utilities): start_networkd() self.wait_online(['veth99:routable', 'veth-peer:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertIn('Address: 10.1.1.200 (DHCP4 via 10.1.1.1)', output) self.assertRegex(output, 'DHCP4 Client ID: IAID:[0-9a-z]*/DUID') @@ -5143,7 +5175,7 @@ class NetworkdDHCPServerRelayAgentTests(unittest.TestCase, Utilities): self.wait_online(['client:routable']) - output = check_output(*networkctl_cmd, '-n', 
'0', 'status', 'client', env=env) + output = networkctl_status('client') print(output) self.assertRegex(output, r'Address: 192.168.5.150 \(DHCP4 via 192.168.5.1\)') @@ -5202,8 +5234,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertNotIn('DHCPREPLY(veth-peer)', output) # Check json format - output = check_output(*networkctl_cmd, '--json=short', 'status', 'veth99', env=env) - check_json(output) + check_json(networkctl_json('veth99')) # solicit mode stop_dnsmasq() @@ -5230,7 +5261,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertRegex(output, 'token :: dev veth99') # Make manager and link state file updated - check_output(*resolvectl_cmd, 'revert', 'veth99', env=env) + resolvectl('revert', 'veth99') # Check link state file print('## link state file') @@ -5257,8 +5288,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertIn('sent size: 0 option: 14 rapid-commit', output) # Check json format - output = check_output(*networkctl_cmd, '--json=short', 'status', 'veth99', env=env) - check_json(output) + check_json(networkctl_json('veth99')) # Testing without rapid commit support with open(os.path.join(network_unit_dir, '25-dhcp-client-ipv6-only.network'), mode='a', encoding='utf-8') as f: @@ -5284,7 +5314,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertRegex(output, 'via fe80::1034:56ff:fe78:9abd') # Make manager and link state file updated - check_output(*resolvectl_cmd, 'revert', 'veth99', env=env) + resolvectl('revert', 'veth99') # Check link state file print('## link state file') @@ -5311,8 +5341,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertNotIn('rapid-commit', output) # Check json format - output = check_output(*networkctl_cmd, '--json=short', 'status', 'veth99', env=env) - check_json(output) + check_json(networkctl_json('veth99')) def test_dhcp_client_ipv6_dbus_status(self): copy_network_unit('25-veth.netdev', 
'25-dhcp-server-veth-peer.network', '25-dhcp-client-ipv6-only.network') @@ -5352,7 +5381,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): # Test renew command # See https://github.com/systemd/systemd/pull/29472#issuecomment-1759092138 - check_output(*networkctl_cmd, 'renew', 'veth99', env=env) + networkctl('renew', 'veth99') for _ in range(100): state = get_dhcp4_client_state('veth99') @@ -5459,8 +5488,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertIn('DOMAINS=example.com', output) print('## json') - output = check_output(*networkctl_cmd, '--json=short', 'status', 'veth99', env=env) - j = json.loads(output) + j = json.loads(networkctl_json('veth99')) self.assertEqual(len(j['DNS']), 2) for i in j['DNS']: @@ -5555,8 +5583,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertIn('DOMAINS=foo.example.com', output) print('## json') - output = check_output(*networkctl_cmd, '--json=short', 'status', 'veth99', env=env) - j = json.loads(output) + j = json.loads(networkctl_json('veth99')) self.assertEqual(len(j['DNS']), 3) for i in j['DNS']: @@ -5778,8 +5805,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.assertNotRegex(output, r'8.8.8.8 via 192.168.5.[0-9]* proto dhcp src 192.168.5.[0-9]* metric 1024') self.assertNotRegex(output, r'9.9.9.9 via 192.168.5.[0-9]* proto dhcp src 192.168.5.[0-9]* metric 1024') - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) def test_dhcp_client_settings_anonymize(self): copy_network_unit('25-veth.netdev', '25-dhcp-server-veth-peer.network', '25-dhcp-client-anonymize.network') @@ -5956,7 +5982,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): start_dnsmasq() self.wait_online(['veth99:routable', 'veth-peer:routable']) - output = check_output(*networkctl_cmd, '-n', '0', 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) 
self.assertRegex(output, '192.168.5') @@ -6020,9 +6046,9 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.wait_address('veth99', r'inet6 2600::[0-9a-f]*/128 scope global (dynamic noprefixroute|noprefixroute dynamic)', ipv='-6') # make resolved re-read the link state file - check_output(*resolvectl_cmd, 'revert', 'veth99', env=env) + resolvectl('revert', 'veth99') - output = check_output(*resolvectl_cmd, 'dns', 'veth99', env=env) + output = resolvectl('dns', 'veth99') print(output) if ipv4: self.assertIn('192.168.5.1', output) @@ -6033,8 +6059,7 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): else: self.assertNotIn('2600::1', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) copy_network_unit('25-veth.netdev', '25-dhcp-server-veth-peer.network', '25-dhcp-client.network', copy_dropins=False) @@ -6065,15 +6090,14 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.wait_address('veth99', r'inet 192.168.5.[0-9]*/24 metric 1024 brd 192.168.5.255 scope global dynamic', ipv='-4') self.wait_address('veth99', r'inet6 2600::[0-9a-f]*/128 scope global (dynamic noprefixroute|noprefixroute dynamic)', ipv='-6') - output = check_output(*networkctl_cmd, 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) if ipv4 or ipv6: self.assertIn('Captive Portal: http://systemd.io', output) else: self.assertNotIn('Captive Portal: http://systemd.io', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) copy_network_unit('25-veth.netdev', '25-dhcp-server-veth-peer.network', '25-dhcp-client.network', copy_dropins=False) @@ -6104,13 +6128,12 @@ class NetworkdDHCPClientTests(unittest.TestCase, Utilities): self.wait_address('veth99', r'inet 192.168.5.[0-9]*/24 metric 1024 brd 192.168.5.255 scope global dynamic', ipv='-4') self.wait_address('veth99', 
r'inet6 2600::[0-9a-f]*/128 scope global (dynamic noprefixroute|noprefixroute dynamic)', ipv='-6') - output = check_output(*networkctl_cmd, 'status', 'veth99', env=env) + output = networkctl_status('veth99') print(output) self.assertNotIn('Captive Portal: ', output) self.assertNotIn('invalid/url', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) copy_network_unit('25-veth.netdev', '25-dhcp-server-veth-peer.network', '25-dhcp-client.network', copy_dropins=False) @@ -6686,18 +6709,17 @@ class NetworkdIPv6PrefixTests(unittest.TestCase, Utilities): self.assertIn('inet6 2001:db8:0:2:fa:de:ca:fe', output) self.assertNotIn('inet6 2001:db8:0:3:', output) - output = check_output(*resolvectl_cmd, 'dns', 'veth-peer', env=env) + output = resolvectl('dns', 'veth-peer') print(output) self.assertRegex(output, '2001:db8:1:1::2') - output = check_output(*resolvectl_cmd, 'domain', 'veth-peer', env=env) + output = resolvectl('domain', 'veth-peer') print(output) self.assertIn('example.com', output) - output = check_output(*networkctl_cmd, '--json=short', 'status', env=env) - check_json(output) + check_json(networkctl_json()) - output = check_output(*networkctl_cmd, '--json=short', 'status', 'veth-peer', env=env) + output = networkctl_json('veth-peer') check_json(output) # PREF64 or NAT64 @@ -6733,11 +6755,11 @@ class NetworkdIPv6PrefixTests(unittest.TestCase, Utilities): self.assertNotIn('inet6 2001:db8:0:1:', output) self.assertIn('inet6 2001:db8:0:2:', output) - output = check_output(*resolvectl_cmd, 'dns', 'veth-peer', env=env) + output = resolvectl('dns', 'veth-peer') print(output) self.assertRegex(output, '2001:db8:1:1::2') - output = check_output(*resolvectl_cmd, 'domain', 'veth-peer', env=env) + output = resolvectl('domain', 'veth-peer') print(output) self.assertIn('example.com', output) diff --git a/test/test-shutdown.py b/test/test-shutdown.py index e491f1e..d19a037 100755 --- 
a/test/test-shutdown.py +++ b/test/test-shutdown.py @@ -12,18 +12,21 @@ import pexpect def run(args): - ret = 1 logger = logging.getLogger("test-shutdown") + logfile = None + + if args.logfile: + logger.debug("Logging pexpect IOs to %s", args.logfile) + logfile = open(args.logfile, 'w') + elif args.verbose: + logfile = sys.stdout logger.info("spawning test") - console = pexpect.spawn(args.command, args.arg, env={ - "TERM": "linux", + console = pexpect.spawn(args.command, args.arg, logfile=logfile, env={ + "TERM": "dumb", }, encoding='utf-8', timeout=60) - if args.verbose: - console.logfile = sys.stdout - logger.debug("child pid %d", console.pid) try: @@ -39,12 +42,16 @@ def run(args): console.send('c') console.expect('screen1 ', 10) + logger.info('wait for the machine to fully boot') + console.sendline('systemctl is-system-running --wait') + console.expect(r'\b(running|degraded)\b', 60) + # console.interact() console.sendline('tty') console.expect(r'/dev/(pts/\d+)') pty = console.match.group(1) - logger.info("window 1 at line %s", pty) + logger.info("window 1 at tty %s", pty) logger.info("schedule reboot") console.sendline('shutdown -r') @@ -112,6 +119,7 @@ def run(args): def main(): parser = argparse.ArgumentParser(description='test logind shutdown feature') parser.add_argument("-v", "--verbose", action="store_true", help="verbose") + parser.add_argument("--logfile", metavar='FILE', help="Save all test input/output to the given path") parser.add_argument("command", help="command to run") parser.add_argument("arg", nargs='*', help="args for command") diff --git a/test/units/testsuite-04.journal-corrupt.sh b/test/units/testsuite-04.journal-corrupt.sh new file mode 100755 index 0000000..051d0ab --- /dev/null +++ b/test/units/testsuite-04.journal-corrupt.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eux +set -o pipefail + +journalctl --rotate --vacuum-files=1 +# Nuke all archived journals, so we start with a clean slate 
+rm -f "/var/log/journal/$(/dev/null 2>&1; then +if [[ ! -e /dev/loop-control ]]; then echo "No loopback device support" SECTOR_SIZES="512" fi @@ -108,7 +108,7 @@ for sector_size in $SECTOR_SIZES ; do rm -f "$BACKING_FILE" truncate -s "$disk_size" "$BACKING_FILE" - if losetup --find >/dev/null 2>&1; then + if [[ -e /dev/loop-control ]]; then # shellcheck disable=SC2086 blockdev="$(losetup --find --show --sector-size $sector_size $BACKING_FILE)" else diff --git a/test/units/testsuite-75.sh b/test/units/testsuite-75.sh index 5423448..86d602d 100755 --- a/test/units/testsuite-75.sh +++ b/test/units/testsuite-75.sh @@ -46,7 +46,8 @@ monitor_check_rr() ( # displayed. We turn off pipefail for this, since we don't care about the # lhs of this pipe expression, we only care about the rhs' result to be # clean - timeout -v 30s journalctl -u resolvectl-monitor.service --since "$since" -f --full | grep -m1 "$match" + # v255-only: match against a syslog tag as well to work around systemd/systemd#30886 + timeout -v 30s journalctl --since "$since" -f --full _SYSTEMD_UNIT="resolvectl-monitor.service" + SYSLOG_IDENTIFIER="resolvectl-monitor" | grep -m1 "$match" ) restart_resolved() { @@ -251,8 +252,8 @@ resolvectl status resolvectl log-level debug # Start monitoring queries -systemd-run -u resolvectl-monitor.service -p Type=notify resolvectl monitor -systemd-run -u resolvectl-monitor-json.service -p Type=notify resolvectl monitor --json=short +systemd-run -u resolvectl-monitor.service -p SyslogIdentifier=resolvectl-monitor -p Type=notify resolvectl monitor +systemd-run -u resolvectl-monitor-json.service -p SyslogIdentifier=resolvectl-monitor-json -p Type=notify resolvectl monitor --json=short # Check if all the zones are valid (zone-check always returns 0, so let's check # if it produces any errors/warnings) @@ -280,16 +281,16 @@ knotc reload TIMESTAMP=$(date '+%F %T') # Issue: https://github.com/systemd/systemd/issues/23951 # With IPv6 enabled -run getent -s resolve hosts 
ns1.unsigned.test -grep -qE "^fd00:dead:beef:cafe::1\s+ns1\.unsigned\.test" "$RUN_OUT" +run getent -s resolve ahosts ns1.unsigned.test +grep -qE "^fd00:dead:beef:cafe::1\s+STREAM\s+ns1\.unsigned\.test" "$RUN_OUT" monitor_check_rr "$TIMESTAMP" "ns1.unsigned.test IN AAAA fd00:dead:beef:cafe::1" # With IPv6 disabled # Issue: https://github.com/systemd/systemd/issues/23951 -# FIXME -#disable_ipv6 -#run getent -s resolve hosts ns1.unsigned.test -#grep -qE "^10\.0\.0\.1\s+ns1\.unsigned\.test" "$RUN_OUT" -#monitor_check_rr "$TIMESTAMP" "ns1.unsigned.test IN A 10.0.0.1" +disable_ipv6 +run getent -s resolve ahosts ns1.unsigned.test +grep -qE "^10\.0\.0\.1\s+STREAM\s+ns1\.unsigned\.test" "$RUN_OUT" +(! grep -qE "fd00:dead:beef:cafe::1" "$RUN_OUT") +monitor_check_rr "$TIMESTAMP" "ns1.unsigned.test IN A 10.0.0.1" enable_ipv6 # Issue: https://github.com/systemd/systemd/issues/18812 @@ -297,16 +298,17 @@ enable_ipv6 # Follow-up issue: https://github.com/systemd/systemd/issues/23152 # Follow-up PR: https://github.com/systemd/systemd/pull/23161 # With IPv6 enabled -run getent -s resolve hosts localhost -grep -qE "^::1\s+localhost" "$RUN_OUT" -run getent -s myhostname hosts localhost -grep -qE "^::1\s+localhost" "$RUN_OUT" +run getent -s resolve ahosts localhost +grep -qE "^::1\s+STREAM\s+localhost" "$RUN_OUT" +run getent -s myhostname ahosts localhost +grep -qE "^::1\s+STREAM\s+localhost" "$RUN_OUT" # With IPv6 disabled disable_ipv6 -run getent -s resolve hosts localhost -grep -qE "^127\.0\.0\.1\s+localhost" "$RUN_OUT" -run getent -s myhostname hosts localhost -grep -qE "^127\.0\.0\.1\s+localhost" "$RUN_OUT" +run getent -s resolve ahosts localhost +grep -qE "^127\.0\.0\.1\s+STREAM\s+localhost" "$RUN_OUT" +(! 
grep -qE "::1" "$RUN_OUT") +run getent -s myhostname ahosts localhost +grep -qE "^127\.0\.0\.1\s+STREAM\s+localhost" "$RUN_OUT" enable_ipv6 # Issue: https://github.com/systemd/systemd/issues/25088 @@ -557,10 +559,10 @@ systemctl stop resolvectl-monitor-json.service # Issue: https://github.com/systemd/systemd/issues/29580 (part #2) # # Check for any warnings regarding malformed messages -(! journalctl -u resolvectl-monitor.service -u reseolvectl-monitor-json.service -p warning --grep malformed) +(! journalctl -p warning --grep malformed _SYSTEMD_UNIT="resolvectl-monitor-json.service" + SYSLOG_IDENTIFIER="resolvectl-monitor-json") # Verify that all queries recorded by `resolvectl monitor --json` produced a valid JSON # with expected fields -journalctl -p info -o cat _SYSTEMD_UNIT="resolvectl-monitor-json.service" | while read -r line; do +journalctl -p info -o cat _SYSTEMD_UNIT="resolvectl-monitor-json.service" + SYSLOG_IDENTIFIER="resolvectl-monitor-json" | while read -r line; do # Check that both "question" and "answer" fields are arrays # # The expression is slightly more complicated due to the fact that the "answer" field is optional, diff --git a/tmpfiles.d/systemd.conf.in b/tmpfiles.d/systemd.conf.in index 11a45a3..d36f069 100644 --- a/tmpfiles.d/systemd.conf.in +++ b/tmpfiles.d/systemd.conf.in @@ -9,7 +9,7 @@ d /run/user 0755 root root - {% if ENABLE_UTMP %} -F! /run/utmp 0664 root utmp - +f+! 
/run/utmp 0664 root utmp - {% endif %} d /run/systemd/ask-password 0755 root root - @@ -26,16 +26,13 @@ Z /run/log/journal/%m ~2750 root systemd-journal - - {% if HAVE_ACL %} {% if ENABLE_ADM_GROUP and ENABLE_WHEEL_GROUP %} a+ /run/log/journal - - - - d:group::r-x,d:group:adm:r-x,d:group:wheel:r-x,group::r-x,group:adm:r-x,group:wheel:r-x -a+ /run/log/journal/%m - - - - d:group:adm:r-x,d:group:wheel:r-x,group:adm:r-x,group:wheel:r-x -a+ /run/log/journal/%m/*.journal* - - - - group:adm:r--,group:wheel:r-- +A+ /run/log/journal/%m - - - - d:group:adm:r-x,d:group:wheel:r-x,group:adm:r-X,group:wheel:r-X {% elif ENABLE_ADM_GROUP %} a+ /run/log/journal - - - - d:group::r-x,d:group:adm:r-x,group::r-x,group:adm:r-x -a+ /run/log/journal/%m - - - - d:group:adm:r-x,group:adm:r-x -a+ /run/log/journal/%m/*.journal* - - - - group:adm:r-- +A+ /run/log/journal/%m - - - - d:group:adm:r-x,group:adm:r-X {% elif ENABLE_WHEEL_GROUP %} a+ /run/log/journal - - - - d:group::r-x,d:group:wheel:r-x,group::r-x,group:wheel:r-x -a+ /run/log/journal/%m - - - - d:group:wheel:r-x,group:wheel:r-x -a+ /run/log/journal/%m/*.journal* - - - - group:wheel:r-- +A+ /run/log/journal/%m - - - - d:group:wheel:r-x,group:wheel:r-X {% endif %} {% endif %} diff --git a/tools/check-efi-alignment.py b/tools/check-efi-alignment.py new file mode 100755 index 0000000..bb33ac0 --- /dev/null +++ b/tools/check-efi-alignment.py @@ -0,0 +1,32 @@ +#!/usr/bin/python3 +# SPDX-License-Identifier: LGPL-2.1-or-later +# vi: set tw=110 sw=4 ts=4 et: + +import sys + +import pefile + + +def main(): + pe = pefile.PE(sys.argv[1], fast_load=True) + + for section in pe.sections: + name = section.Name.rstrip(b"\x00").decode() + file_addr = section.PointerToRawData + virt_addr = section.VirtualAddress + print(f"{name:10s} file=0x{file_addr:08x} virt=0x{virt_addr:08x}") + + if file_addr % 512 != 0: + print(f"File address of {name} section is not aligned to 512 bytes", file=sys.stderr) + return 1 + + if virt_addr % 512 != 0: + print(f"Virt 
address of {name} section is not aligned to 512 bytes", file=sys.stderr) + return 1 + +if __name__ == '__main__': + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} pe-image") + sys.exit(1) + + sys.exit(main()) diff --git a/tools/elf2efi.py b/tools/elf2efi.py index 54f64fa..cb1a284 100755 --- a/tools/elf2efi.py +++ b/tools/elf2efi.py @@ -26,6 +26,7 @@ import hashlib import io import os import pathlib +import sys import time import typing from ctypes import ( @@ -55,26 +56,26 @@ from elftools.elf.relocation import ( class PeCoffHeader(LittleEndianStructure): _fields_ = ( - ("Machine", c_uint16), - ("NumberOfSections", c_uint16), - ("TimeDateStamp", c_uint32), + ("Machine", c_uint16), + ("NumberOfSections", c_uint16), + ("TimeDateStamp", c_uint32), ("PointerToSymbolTable", c_uint32), - ("NumberOfSymbols", c_uint32), + ("NumberOfSymbols", c_uint32), ("SizeOfOptionalHeader", c_uint16), - ("Characteristics", c_uint16), + ("Characteristics", c_uint16), ) class PeDataDirectory(LittleEndianStructure): _fields_ = ( ("VirtualAddress", c_uint32), - ("Size", c_uint32), + ("Size", c_uint32), ) class PeRelocationBlock(LittleEndianStructure): _fields_ = ( - ("PageRVA", c_uint32), + ("PageRVA", c_uint32), ("BlockSize", c_uint32), ) @@ -86,62 +87,62 @@ class PeRelocationBlock(LittleEndianStructure): class PeRelocationEntry(LittleEndianStructure): _fields_ = ( ("Offset", c_uint16, 12), - ("Type", c_uint16, 4), + ("Type", c_uint16, 4), ) class PeOptionalHeaderStart(LittleEndianStructure): _fields_ = ( - ("Magic", c_uint16), - ("MajorLinkerVersion", c_uint8), - ("MinorLinkerVersion", c_uint8), - ("SizeOfCode", c_uint32), - ("SizeOfInitializedData", c_uint32), + ("Magic", c_uint16), + ("MajorLinkerVersion", c_uint8), + ("MinorLinkerVersion", c_uint8), + ("SizeOfCode", c_uint32), + ("SizeOfInitializedData", c_uint32), ("SizeOfUninitializedData", c_uint32), - ("AddressOfEntryPoint", c_uint32), - ("BaseOfCode", c_uint32), + ("AddressOfEntryPoint", c_uint32), + ("BaseOfCode", 
c_uint32), ) class PeOptionalHeaderMiddle(LittleEndianStructure): _fields_ = ( - ("SectionAlignment", c_uint32), - ("FileAlignment", c_uint32), + ("SectionAlignment", c_uint32), + ("FileAlignment", c_uint32), ("MajorOperatingSystemVersion", c_uint16), ("MinorOperatingSystemVersion", c_uint16), - ("MajorImageVersion", c_uint16), - ("MinorImageVersion", c_uint16), - ("MajorSubsystemVersion", c_uint16), - ("MinorSubsystemVersion", c_uint16), - ("Win32VersionValue", c_uint32), - ("SizeOfImage", c_uint32), - ("SizeOfHeaders", c_uint32), - ("CheckSum", c_uint32), - ("Subsystem", c_uint16), - ("DllCharacteristics", c_uint16), + ("MajorImageVersion", c_uint16), + ("MinorImageVersion", c_uint16), + ("MajorSubsystemVersion", c_uint16), + ("MinorSubsystemVersion", c_uint16), + ("Win32VersionValue", c_uint32), + ("SizeOfImage", c_uint32), + ("SizeOfHeaders", c_uint32), + ("CheckSum", c_uint32), + ("Subsystem", c_uint16), + ("DllCharacteristics", c_uint16), ) class PeOptionalHeaderEnd(LittleEndianStructure): _fields_ = ( - ("LoaderFlags", c_uint32), - ("NumberOfRvaAndSizes", c_uint32), - ("ExportTable", PeDataDirectory), - ("ImportTable", PeDataDirectory), - ("ResourceTable", PeDataDirectory), - ("ExceptionTable", PeDataDirectory), - ("CertificateTable", PeDataDirectory), - ("BaseRelocationTable", PeDataDirectory), - ("Debug", PeDataDirectory), - ("Architecture", PeDataDirectory), - ("GlobalPtr", PeDataDirectory), - ("TLSTable", PeDataDirectory), - ("LoadConfigTable", PeDataDirectory), - ("BoundImport", PeDataDirectory), - ("IAT", PeDataDirectory), + ("LoaderFlags", c_uint32), + ("NumberOfRvaAndSizes", c_uint32), + ("ExportTable", PeDataDirectory), + ("ImportTable", PeDataDirectory), + ("ResourceTable", PeDataDirectory), + ("ExceptionTable", PeDataDirectory), + ("CertificateTable", PeDataDirectory), + ("BaseRelocationTable", PeDataDirectory), + ("Debug", PeDataDirectory), + ("Architecture", PeDataDirectory), + ("GlobalPtr", PeDataDirectory), + ("TLSTable", PeDataDirectory), + 
("LoadConfigTable", PeDataDirectory), + ("BoundImport", PeDataDirectory), + ("IAT", PeDataDirectory), ("DelayImportDescriptor", PeDataDirectory), - ("CLRRuntimeHeader", PeDataDirectory), - ("Reserved", PeDataDirectory), + ("CLRRuntimeHeader", PeDataDirectory), + ("Reserved", PeDataDirectory), ) @@ -152,44 +153,44 @@ class PeOptionalHeader(LittleEndianStructure): class PeOptionalHeader32(PeOptionalHeader): _anonymous_ = ("Start", "Middle", "End") _fields_ = ( - ("Start", PeOptionalHeaderStart), - ("BaseOfData", c_uint32), - ("ImageBase", c_uint32), - ("Middle", PeOptionalHeaderMiddle), + ("Start", PeOptionalHeaderStart), + ("BaseOfData", c_uint32), + ("ImageBase", c_uint32), + ("Middle", PeOptionalHeaderMiddle), ("SizeOfStackReserve", c_uint32), - ("SizeOfStackCommit", c_uint32), - ("SizeOfHeapReserve", c_uint32), - ("SizeOfHeapCommit", c_uint32), - ("End", PeOptionalHeaderEnd), + ("SizeOfStackCommit", c_uint32), + ("SizeOfHeapReserve", c_uint32), + ("SizeOfHeapCommit", c_uint32), + ("End", PeOptionalHeaderEnd), ) class PeOptionalHeader32Plus(PeOptionalHeader): _anonymous_ = ("Start", "Middle", "End") _fields_ = ( - ("Start", PeOptionalHeaderStart), - ("ImageBase", c_uint64), - ("Middle", PeOptionalHeaderMiddle), + ("Start", PeOptionalHeaderStart), + ("ImageBase", c_uint64), + ("Middle", PeOptionalHeaderMiddle), ("SizeOfStackReserve", c_uint64), - ("SizeOfStackCommit", c_uint64), - ("SizeOfHeapReserve", c_uint64), - ("SizeOfHeapCommit", c_uint64), - ("End", PeOptionalHeaderEnd), + ("SizeOfStackCommit", c_uint64), + ("SizeOfHeapReserve", c_uint64), + ("SizeOfHeapCommit", c_uint64), + ("End", PeOptionalHeaderEnd), ) class PeSection(LittleEndianStructure): _fields_ = ( - ("Name", c_char * 8), - ("VirtualSize", c_uint32), - ("VirtualAddress", c_uint32), - ("SizeOfRawData", c_uint32), - ("PointerToRawData", c_uint32), + ("Name", c_char * 8), + ("VirtualSize", c_uint32), + ("VirtualAddress", c_uint32), + ("SizeOfRawData", c_uint32), + ("PointerToRawData", c_uint32), 
("PointerToRelocations", c_uint32), ("PointerToLinenumbers", c_uint32), - ("NumberOfRelocations", c_uint16), - ("NumberOfLinenumbers", c_uint16), - ("Characteristics", c_uint32), + ("NumberOfRelocations", c_uint16), + ("NumberOfLinenumbers", c_uint16), + ("Characteristics", c_uint32), ) def __init__(self): @@ -206,12 +207,13 @@ assert sizeof(PeOptionalHeader32Plus) == 240 PE_CHARACTERISTICS_RX = 0x60000020 # CNT_CODE|MEM_READ|MEM_EXECUTE PE_CHARACTERISTICS_RW = 0xC0000040 # CNT_INITIALIZED_DATA|MEM_READ|MEM_WRITE -PE_CHARACTERISTICS_R = 0x40000040 # CNT_INITIALIZED_DATA|MEM_READ +PE_CHARACTERISTICS_R = 0x40000040 # CNT_INITIALIZED_DATA|MEM_READ IGNORE_SECTIONS = [ ".eh_frame", ".eh_frame_hdr", ".ARM.exidx", + ".relro_padding", ] IGNORE_SECTION_TYPES = [ @@ -246,9 +248,12 @@ def align_down(x: int, align: int) -> int: def next_section_address(sections: typing.List[PeSection]) -> int: - return align_to( - sections[-1].VirtualAddress + sections[-1].VirtualSize, SECTION_ALIGNMENT - ) + return align_to(sections[-1].VirtualAddress + sections[-1].VirtualSize, + SECTION_ALIGNMENT) + + +class BadSectionError(ValueError): + "One of the sections is in a bad state" def iter_copy_sections(elf: ELFFile) -> typing.Iterator[PeSection]: @@ -261,8 +266,9 @@ def iter_copy_sections(elf: ELFFile) -> typing.Iterator[PeSection]: relro = None for elf_seg in elf.iter_segments(): if elf_seg["p_type"] == "PT_LOAD" and elf_seg["p_align"] != SECTION_ALIGNMENT: - raise RuntimeError("ELF segments are not properly aligned.") - elif elf_seg["p_type"] == "PT_GNU_RELRO": + raise BadSectionError(f"ELF segment {elf_seg['p_type']} is not properly aligned" + f" ({elf_seg['p_align']} != {SECTION_ALIGNMENT})") + if elf_seg["p_type"] == "PT_GNU_RELRO": relro = elf_seg for elf_s in elf.iter_sections(): @@ -270,10 +276,14 @@ def iter_copy_sections(elf: ELFFile) -> typing.Iterator[PeSection]: elf_s["sh_flags"] & SH_FLAGS.SHF_ALLOC == 0 or elf_s["sh_type"] in IGNORE_SECTION_TYPES or elf_s.name in 
IGNORE_SECTIONS + or elf_s["sh_size"] == 0 ): continue if elf_s["sh_type"] not in ["SHT_PROGBITS", "SHT_NOBITS"]: - raise RuntimeError(f"Unknown section {elf_s.name}.") + raise BadSectionError(f"Unknown section {elf_s.name} with type {elf_s['sh_type']}") + if elf_s.name == '.got': + # FIXME: figure out why those sections are inserted + print("WARNING: Non-empty .got section", file=sys.stderr) if elf_s["sh_flags"] & SH_FLAGS.SHF_EXECINSTR: rwx = PE_CHARACTERISTICS_RX @@ -305,7 +315,7 @@ def iter_copy_sections(elf: ELFFile) -> typing.Iterator[PeSection]: def convert_sections(elf: ELFFile, opt: PeOptionalHeader) -> typing.List[PeSection]: - last_vma = 0 + last_vma = (0, 0) sections = [] for pe_s in iter_copy_sections(elf): @@ -325,10 +335,11 @@ def convert_sections(elf: ELFFile, opt: PeOptionalHeader) -> typing.List[PeSecti PE_CHARACTERISTICS_R: b".rodata", }[pe_s.Characteristics] - # This can happen if not building with `-z separate-code`. - if pe_s.VirtualAddress < last_vma: - raise RuntimeError("Overlapping PE sections.") - last_vma = pe_s.VirtualAddress + pe_s.VirtualSize + # This can happen if not building with '-z separate-code'. 
+ if pe_s.VirtualAddress < sum(last_vma): + raise BadSectionError(f"Section {pe_s.Name.decode()!r} @0x{pe_s.VirtualAddress:x} overlaps" + f" previous section @0x{last_vma[0]:x}+0x{last_vma[1]:x}=@0x{sum(last_vma):x}") + last_vma = (pe_s.VirtualAddress, pe_s.VirtualSize) if pe_s.Name == b".text": opt.BaseOfCode = pe_s.VirtualAddress @@ -355,9 +366,9 @@ def copy_sections( if not elf_s: continue if elf_s.data_alignment > 1 and SECTION_ALIGNMENT % elf_s.data_alignment != 0: - raise RuntimeError(f"ELF section {name} is not aligned.") + raise BadSectionError(f"ELF section {name} is not aligned") if elf_s["sh_flags"] & (SH_FLAGS.SHF_EXECINSTR | SH_FLAGS.SHF_WRITE) != 0: - raise RuntimeError(f"ELF section {name} is not read-only data.") + raise BadSectionError(f"ELF section {name} is not read-only data") pe_s = PeSection() pe_s.Name = name.encode() @@ -376,12 +387,8 @@ def apply_elf_relative_relocation( sections: typing.List[PeSection], addend_size: int, ): - # fmt: off - [target] = [ - pe_s for pe_s in sections - if pe_s.VirtualAddress <= reloc["r_offset"] < pe_s.VirtualAddress + len(pe_s.data) - ] - # fmt: on + [target] = [pe_s for pe_s in sections + if pe_s.VirtualAddress <= reloc["r_offset"] < pe_s.VirtualAddress + len(pe_s.data)] addend_offset = reloc["r_offset"] - target.VirtualAddress @@ -425,9 +432,10 @@ def convert_elf_reloc_table( continue if reloc["r_info_type"] == RELATIVE_RELOC: - apply_elf_relative_relocation( - reloc, elf_image_base, sections, elf.elfclass // 8 - ) + apply_elf_relative_relocation(reloc, + elf_image_base, + sections, + elf.elfclass // 8) # Now that the ELF relocation has been applied, we can create a PE relocation. 
block_rva = reloc["r_offset"] & ~0xFFF @@ -442,7 +450,7 @@ def convert_elf_reloc_table( continue - raise RuntimeError(f"Unsupported relocation {reloc}") + raise BadSectionError(f"Unsupported relocation {reloc}") def convert_elf_relocations( @@ -453,27 +461,25 @@ def convert_elf_relocations( ) -> typing.Optional[PeSection]: dynamic = elf.get_section_by_name(".dynamic") if dynamic is None: - raise RuntimeError("ELF .dynamic section is missing.") + raise BadSectionError("ELF .dynamic section is missing") [flags_tag] = dynamic.iter_tags("DT_FLAGS_1") if not flags_tag["d_val"] & ENUM_DT_FLAGS_1["DF_1_PIE"]: - raise RuntimeError("ELF file is not a PIE.") + raise ValueError("ELF file is not a PIE") # This checks that the ELF image base is 0. symtab = elf.get_section_by_name(".symtab") if symtab: exe_start = symtab.get_symbol_by_name("__executable_start") if exe_start and exe_start[0]["st_value"] != 0: - raise RuntimeError("Unexpected ELF image base.") - - opt.SizeOfHeaders = align_to( - PE_OFFSET - + len(PE_MAGIC) - + sizeof(PeCoffHeader) - + sizeof(opt) - + sizeof(PeSection) * max(len(sections) + 1, minimum_sections), - FILE_ALIGNMENT, - ) + raise ValueError("Unexpected ELF image base") + + opt.SizeOfHeaders = align_to(PE_OFFSET + + len(PE_MAGIC) + + sizeof(PeCoffHeader) + + sizeof(opt) + + sizeof(PeSection) * max(len(sections) + 1, minimum_sections), + FILE_ALIGNMENT) # We use the basic VMA layout from the ELF image in the PE image. This could cause the first # section to overlap the PE image headers during runtime at VMA 0. We can simply apply a fixed @@ -482,9 +488,8 @@ def convert_elf_relocations( # the ELF portions of the image. 
segment_offset = 0 if sections[0].VirtualAddress < opt.SizeOfHeaders: - segment_offset = align_to( - opt.SizeOfHeaders - sections[0].VirtualAddress, SECTION_ALIGNMENT - ) + segment_offset = align_to(opt.SizeOfHeaders - sections[0].VirtualAddress, + SECTION_ALIGNMENT) opt.AddressOfEntryPoint = elf["e_entry"] + segment_offset opt.BaseOfCode += segment_offset @@ -494,10 +499,12 @@ def convert_elf_relocations( pe_reloc_blocks: typing.Dict[int, PeRelocationBlock] = {} for reloc_type, reloc_table in dynamic.get_relocation_tables().items(): if reloc_type not in ["REL", "RELA"]: - raise RuntimeError("Unsupported relocation type {elf_reloc_type}.") - convert_elf_reloc_table( - elf, reloc_table, opt.ImageBase + segment_offset, sections, pe_reloc_blocks - ) + raise BadSectionError(f"Unsupported relocation type {reloc_type}") + convert_elf_reloc_table(elf, + reloc_table, + opt.ImageBase + segment_offset, + sections, + pe_reloc_blocks) for pe_s in sections: pe_s.VirtualAddress += segment_offset @@ -517,9 +524,7 @@ def convert_elf_relocations( block.entries.append(PeRelocationEntry()) block.PageRVA += segment_offset - block.BlockSize = ( - sizeof(PeRelocationBlock) + sizeof(PeRelocationEntry) * n_relocs - ) + block.BlockSize = sizeof(PeRelocationBlock) + sizeof(PeRelocationEntry) * n_relocs data += block for entry in sorted(block.entries, key=lambda e: e.Offset): data += entry @@ -539,7 +544,10 @@ def convert_elf_relocations( def write_pe( - file, coff: PeCoffHeader, opt: PeOptionalHeader, sections: typing.List[PeSection] + file, + coff: PeCoffHeader, + opt: PeOptionalHeader, + sections: typing.List[PeSection], ): file.write(b"MZ") file.seek(0x3C, io.SEEK_SET) @@ -552,8 +560,8 @@ def write_pe( offset = opt.SizeOfHeaders for pe_s in sorted(sections, key=lambda s: s.VirtualAddress): if pe_s.VirtualAddress < opt.SizeOfHeaders: - # Linker script should make sure this does not happen. 
- raise RuntimeError(f"Section {pe_s.Name} overlapping PE headers.") + raise BadSectionError(f"Section {pe_s.Name} @0x{pe_s.VirtualAddress:x} overlaps" + f" PE headers ending at 0x{opt.SizeOfHeaders:x}") pe_s.PointerToRawData = offset file.write(pe_s) @@ -571,9 +579,9 @@ def write_pe( def elf2efi(args: argparse.Namespace): elf = ELFFile(args.ELF) if not elf.little_endian: - raise RuntimeError("ELF file is not little-endian.") + raise ValueError("ELF file is not little-endian") if elf["e_type"] not in ["ET_DYN", "ET_EXEC"]: - raise RuntimeError("Unsupported ELF type.") + raise ValueError(f"Unsupported ELF type {elf['e_type']}") pe_arch = { "EM_386": 0x014C, @@ -584,7 +592,7 @@ def elf2efi(args: argparse.Namespace): "EM_X86_64": 0x8664, }.get(elf["e_machine"]) if pe_arch is None: - raise RuntimeError(f"Unsupported ELF arch {elf['e_machine']}") + raise ValueError(f"Unsupported ELF architecture {elf['e_machine']}") coff = PeCoffHeader() opt = PeOptionalHeader32() if elf.elfclass == 32 else PeOptionalHeader32Plus() @@ -637,7 +645,7 @@ def elf2efi(args: argparse.Namespace): write_pe(args.PE, coff, opt, sections) -def main(): +def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Convert ELF binaries to PE/EFI") parser.add_argument( "--version-major", @@ -691,7 +699,11 @@ def main(): default="", help="Copy these sections if found", ) + return parser + +def main(): + parser = create_parser() elf2efi(parser.parse_args()) diff --git a/units/systemd-modules-load.service.in b/units/systemd-modules-load.service.in index 0fe6740..ad262fa 100644 --- a/units/systemd-modules-load.service.in +++ b/units/systemd-modules-load.service.in @@ -20,7 +20,9 @@ ConditionDirectoryNotEmpty=|/usr/local/lib/modules-load.d ConditionDirectoryNotEmpty=|/etc/modules-load.d ConditionDirectoryNotEmpty=|/run/modules-load.d ConditionKernelCommandLine=|modules-load +ConditionKernelCommandLine=|modules_load ConditionKernelCommandLine=|rd.modules-load
+ConditionKernelCommandLine=|rd.modules_load [Service] Type=oneshot diff --git a/units/systemd-tpm2-setup-early.service.in b/units/systemd-tpm2-setup-early.service.in index c1597ea..6996efe 100644 --- a/units/systemd-tpm2-setup-early.service.in +++ b/units/systemd-tpm2-setup-early.service.in @@ -12,6 +12,7 @@ Description=TPM2 SRK Setup (Early) Documentation=man:systemd-tpm2-setup.service(8) DefaultDependencies=no Conflicts=shutdown.target +After=tpm2.target systemd-pcrphase-initrd.service Before=sysinit.target shutdown.target ConditionSecurity=measured-uki ConditionPathExists=!/run/systemd/tpm2-srk-public-key.pem @@ -19,4 +20,4 @@ ConditionPathExists=!/run/systemd/tpm2-srk-public-key.pem [Service] Type=oneshot RemainAfterExit=yes -ExecStart={{LIBEXECDIR}}/systemd-tpm2-setup --early=yes +ExecStart={{LIBEXECDIR}}/systemd-tpm2-setup --early=yes --graceful diff --git a/units/systemd-tpm2-setup.service.in b/units/systemd-tpm2-setup.service.in index 6c99f3a..8c1851f 100644 --- a/units/systemd-tpm2-setup.service.in +++ b/units/systemd-tpm2-setup.service.in @@ -21,4 +21,4 @@ ConditionPathExists=!/etc/initrd-release [Service] Type=oneshot RemainAfterExit=yes -ExecStart={{LIBEXECDIR}}/systemd-tpm2-setup +ExecStart={{LIBEXECDIR}}/systemd-tpm2-setup --graceful -- cgit v1.2.3